diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-11 08:27:49 +0000 |
commit | ace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch) | |
tree | b2d64bc10158fdd5497876388cd68142ca374ed3 /arch/x86/mm | |
parent | Initial commit. (diff) | |
download | linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip |
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
50 files changed, 21539 insertions, 0 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile new file mode 100644 index 0000000000..c80febc44c --- /dev/null +++ b/arch/x86/mm/Makefile @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: GPL-2.0 +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n +KCOV_INSTRUMENT_mem_encrypt_amd.o := n +KCOV_INSTRUMENT_mem_encrypt_identity.o := n +KCOV_INSTRUMENT_pgprot.o := n + +KASAN_SANITIZE_mem_encrypt.o := n +KASAN_SANITIZE_mem_encrypt_amd.o := n +KASAN_SANITIZE_mem_encrypt_identity.o := n +KASAN_SANITIZE_pgprot.o := n + +# Disable KCSAN entirely, because otherwise we get warnings that some functions +# reference __initdata sections. +KCSAN_SANITIZE := n +# Avoid recursion by not calling KMSAN hooks for CEA code. +KMSAN_SANITIZE_cpu_entry_area.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_mem_encrypt.o = -pg +CFLAGS_REMOVE_mem_encrypt_amd.o = -pg +CFLAGS_REMOVE_mem_encrypt_identity.o = -pg +CFLAGS_REMOVE_pgprot.o = -pg +endif + +obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ + pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o pgprot.o + +obj-y += pat/ + +# Make sure __phys_addr has no stackprotector +CFLAGS_physaddr.o := -fno-stack-protector +CFLAGS_mem_encrypt_identity.o := -fno-stack-protector + +CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace + +obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PTDUMP_DEBUGFS) += debug_pagetables.o + +obj-$(CONFIG_HIGHMEM) += highmem_32.o + +KASAN_SANITIZE_kasan_init_$(BITS).o := n +obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o + +KMSAN_SANITIZE_kmsan_shadow.o := n +obj-$(CONFIG_KMSAN) += kmsan_shadow.o + +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-y := kmmio.o pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o + +obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o +obj-$(CONFIG_AMD_NUMA) 
+= amdtopology.o +obj-$(CONFIG_ACPI_NUMA) += srat.o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o + +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o + +obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o + +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c new file mode 100644 index 0000000000..b3ca7d23e4 --- /dev/null +++ b/arch/x86/mm/amdtopology.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the AMD northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/nodemask.h> +#include <linux/memblock.h> + +#include <asm/io.h> +#include <linux/pci_ids.h> +#include <linux/acpi.h> +#include <asm/types.h> +#include <asm/mmzone.h> +#include <asm/proto.h> +#include <asm/e820/api.h> +#include <asm/pci-direct.h> +#include <asm/numa.h> +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <asm/amd_nb.h> + +static unsigned char __initdata nodeids[8]; + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -ENOENT; +} + +int __init amd_numa_init(void) +{ + u64 start = PFN_PHYS(0); + 
u64 end = PFN_PHYS(max_pfn); + unsigned numnodes; + u64 prevbase; + int i, j, nb; + u32 nodeid, reg; + unsigned int bits, cores, apicid_base; + + if (!early_pci_allowed()) + return -EINVAL; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -ENOENT; + + pr_info("Number of physical nodes %d\n", numnodes); + + prevbase = 0; + for (i = 0; i < 8; i++) { + u64 base, limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeids[i] = nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + pr_info("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + pr_info("Skipping node entry %d (base %Lx)\n", + i, base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + pr_err("Node %d using interleaving mode %Lx/%Lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); + return -EINVAL; + } + if (node_isset(nodeid, numa_nodes_parsed)) { + pr_info("Node %d already present, skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit++; + limit <<= 24; + + if (limit > end) + limit = end; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + pr_err("Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + pr_err("Node %d bogus settings %Lx-%Lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. 
*/ + if (prevbase > base) { + pr_err("Node map not sorted %Lx,%Lx\n", + prevbase, base); + return -EINVAL; + } + + pr_info("Node %d MemBase %016Lx Limit %016Lx\n", + nodeid, base, limit); + + prevbase = base; + numa_add_memblk(nodeid, base, limit); + node_set(nodeid, numa_nodes_parsed); + } + + if (nodes_empty(numa_nodes_parsed)) + return -ENOENT; + + /* + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the coreid bits from early_identify_cpu. + */ + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + apicid_base = 0; + + /* + * get boot-time SMP configuration: + */ + early_get_smp_config(); + + if (boot_cpu_physical_apicid > 0) { + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for_each_node_mask(i, numa_nodes_parsed) + for (j = apicid_base; j < cores + apicid_base; j++) + set_apicid_to_node((i << bits) + j, i); + + return 0; +} diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c new file mode 100644 index 0000000000..e91500a809 --- /dev/null +++ b/arch/x86/mm/cpu_entry_area.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/spinlock.h> +#include <linux/percpu.h> +#include <linux/kallsyms.h> +#include <linux/kcore.h> +#include <linux/pgtable.h> + +#include <asm/cpu_entry_area.h> +#include <asm/fixmap.h> +#include <asm/desc.h> +#include <asm/kasan.h> +#include <asm/setup.h> + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); +DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); + +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset); + +static __always_inline unsigned int cea_offset(unsigned int cpu) +{ + return per_cpu(_cea_offset, cpu); +} + +static __init void init_cea_offsets(void) +{ + unsigned int max_cea; + unsigned int i, j; + + if (!kaslr_enabled()) { + 
for_each_possible_cpu(i) + per_cpu(_cea_offset, i) = i; + return; + } + + max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE; + + /* O(sodding terrible) */ + for_each_possible_cpu(i) { + unsigned int cea; + +again: + cea = get_random_u32_below(max_cea); + + for_each_possible_cpu(j) { + if (cea_offset(j) == cea) + goto again; + + if (i == j) + break; + } + + per_cpu(_cea_offset, i) = cea; + } +} +#else /* !X86_64 */ +DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); + +static __always_inline unsigned int cea_offset(unsigned int cpu) +{ + return cpu; +} +static inline void init_cea_offsets(void) { } +#endif + +/* Is called from entry code, so must be noinstr */ +noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ + unsigned long va = CPU_ENTRY_AREA_PER_CPU + cea_offset(cpu) * CPU_ENTRY_AREA_SIZE; + BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + + return (struct cpu_entry_area *) va; +} +EXPORT_SYMBOL(get_cpu_entry_area); + +void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) +{ + unsigned long va = (unsigned long) cea_vaddr; + pte_t pte = pfn_pte(pa >> PAGE_SHIFT, flags); + + /* + * The cpu_entry_area is shared between the user and kernel + * page tables. All of its ptes can safely be global. + * _PAGE_GLOBAL gets reused to help indicate PROT_NONE for + * non-present PTEs, so be careful not to set it in that + * case to avoid confusion. 
+ */ + if (boot_cpu_has(X86_FEATURE_PGE) && + (pgprot_val(flags) & _PAGE_PRESENT)) + pte = pte_set_flags(pte, _PAGE_GLOBAL); + + set_pte_vaddr(va, pte); +} + +static void __init +cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) +{ + for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); +} + +static void __init percpu_setup_debug_store(unsigned int cpu) +{ +#ifdef CONFIG_CPU_SUP_INTEL + unsigned int npages; + void *cea; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + cea = &get_cpu_entry_area(cpu)->cpu_debug_store; + npages = sizeof(struct debug_store) / PAGE_SIZE; + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, + PAGE_KERNEL); + + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; + /* + * Force the population of PMDs for not yet allocated per cpu + * memory like debug store buffers. + */ + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; + for (; npages; npages--, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); +#endif +} + +#ifdef CONFIG_X86_64 + +#define cea_map_stack(name) do { \ + npages = sizeof(estacks->name## _stack) / PAGE_SIZE; \ + cea_map_percpu_pages(cea->estacks.name## _stack, \ + estacks->name## _stack, npages, PAGE_KERNEL); \ + } while (0) + +static void __init percpu_setup_exception_stacks(unsigned int cpu) +{ + struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + unsigned int npages; + + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + + per_cpu(cea_exception_stacks, cpu) = &cea->estacks; + + /* + * The exceptions stack mappings in the per cpu area are protected + * by guard pages so each stack must be mapped separately. DB2 is + * not mapped; it just exists to catch triple nesting of #DB. 
+ */ + cea_map_stack(DF); + cea_map_stack(NMI); + cea_map_stack(DB); + cea_map_stack(MCE); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) { + cea_map_stack(VC); + cea_map_stack(VC2); + } + } +} +#else +static inline void percpu_setup_exception_stacks(unsigned int cpu) +{ + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + + cea_map_percpu_pages(&cea->doublefault_stack, + &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL); +} +#endif + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(unsigned int cpu) +{ + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); +#ifdef CONFIG_X86_64 + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ + pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; +#else + /* + * On 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. + */ + pgprot_t gdt_prot = PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; +#endif + + kasan_populate_shadow_for_vaddr(cea, CPU_ENTRY_AREA_SIZE, + early_cpu_to_node(cpu)); + + cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot); + + cea_map_percpu_pages(&cea->entry_stack_page, + per_cpu_ptr(&entry_stack_storage, cpu), 1, + PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): + * + * Avoid placing a page boundary in the part of the TSS that the + * processor reads during a task switch (the first 104 bytes). The + * processor may not correctly perform address translations if a + * boundary occurs in this area. During a task switch, the processor + * reads and writes into the first 104 bytes of each TSS (using + * contiguous physical addresses beginning with the physical address + * of the first byte of the TSS). 
So, after TSS access begins, if + * part of the 104 bytes is not physically contiguous, the processor + * will access incorrect information without generating a page-fault + * exception. + * + * There are also a lot of errata involving the TSS spanning a page + * boundary. Assert that we're not doing that. + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + /* + * VMX changes the host TR limit to 0x67 after a VM exit. This is + * okay, since 0x67 covers the size of struct x86_hw_tss. Make sure + * that this is correct. + */ + BUILD_BUG_ON(offsetof(struct tss_struct, x86_tss) != 0); + BUILD_BUG_ON(sizeof(struct x86_hw_tss) != 0x68); + + cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + +#ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = cea; +#endif + + percpu_setup_exception_stacks(cpu); + + percpu_setup_debug_store(cpu); +} + +static __init void setup_cpu_entry_area_ptes(void) +{ +#ifdef CONFIG_X86_32 + unsigned long start, end; + + /* The +1 is for the readonly IDT: */ + BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE); + BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); + + start = CPU_ENTRY_AREA_BASE; + end = start + CPU_ENTRY_AREA_MAP_SIZE; + + /* Careful here: start + PMD_SIZE might wrap around */ + for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) + populate_extra_pte(start); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ + unsigned int cpu; + + init_cea_offsets(); + + setup_cpu_entry_area_ptes(); + + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); + + /* + * This is the last essential update to swapper_pgdir which needs + * to be synchronized to initial_page_table on 32bit. 
+ */ + sync_initial_page_table(); +} diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c new file mode 100644 index 0000000000..b43301cb2a --- /dev/null +++ b/arch/x86/mm/debug_pagetables.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/debugfs.h> +#include <linux/efi.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/pgtable.h> + +static int ptdump_show(struct seq_file *m, void *v) +{ + ptdump_walk_pgd_level_debugfs(m, &init_mm, false); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(ptdump); + +static int ptdump_curknl_show(struct seq_file *m, void *v) +{ + if (current->mm->pgd) + ptdump_walk_pgd_level_debugfs(m, current->mm, false); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(ptdump_curknl); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static int ptdump_curusr_show(struct seq_file *m, void *v) +{ + if (current->mm->pgd) + ptdump_walk_pgd_level_debugfs(m, current->mm, true); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(ptdump_curusr); +#endif + +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) +static int ptdump_efi_show(struct seq_file *m, void *v) +{ + if (efi_mm.pgd) + ptdump_walk_pgd_level_debugfs(m, &efi_mm, false); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(ptdump_efi); +#endif + +static struct dentry *dir; + +static int __init pt_dump_debug_init(void) +{ + dir = debugfs_create_dir("page_tables", NULL); + + debugfs_create_file("kernel", 0400, dir, NULL, &ptdump_fops); + debugfs_create_file("current_kernel", 0400, dir, NULL, + &ptdump_curknl_fops); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + debugfs_create_file("current_user", 0400, dir, NULL, + &ptdump_curusr_fops); +#endif +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) + debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops); +#endif + return 0; +} + +static void __exit pt_dump_debug_exit(void) +{ + debugfs_remove_recursive(dir); +} + +module_init(pt_dump_debug_init); +module_exit(pt_dump_debug_exit); +MODULE_AUTHOR("Arjan van de Ven 
<arjan@linux.intel.com>"); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c new file mode 100644 index 0000000000..e1b599ecbb --- /dev/null +++ b/arch/x86/mm/dump_pagetables.c @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Debug helper to dump the current kernel pagetables of the system + * so that we can see what the various memory ranges are set to. + * + * (C) Copyright 2008 Intel Corporation + * + * Author: Arjan van de Ven <arjan@linux.intel.com> + */ + +#include <linux/debugfs.h> +#include <linux/kasan.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/highmem.h> +#include <linux/pci.h> +#include <linux/ptdump.h> + +#include <asm/e820/types.h> + +/* + * The dumper groups pagetable entries of the same type into one, and for + * that it needs to keep some state when walking, and flush this state + * when a "break" in the continuity is found. 
+ */ +struct pg_state { + struct ptdump_state ptdump; + int level; + pgprotval_t current_prot; + pgprotval_t effective_prot; + pgprotval_t prot_levels[5]; + unsigned long start_address; + const struct addr_marker *marker; + unsigned long lines; + bool to_dmesg; + bool check_wx; + unsigned long wx_pages; + struct seq_file *seq; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; + unsigned long max_lines; +}; + +/* Address space markers hints */ + +#ifdef CONFIG_X86_64 + +enum address_markers_idx { + USER_SPACE_NR = 0, + KERNEL_SPACE_NR, +#ifdef CONFIG_MODIFY_LDT_SYSCALL + LDT_NR, +#endif + LOW_KERNEL_NR, + VMALLOC_START_NR, + VMEMMAP_START_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif + CPU_ENTRY_AREA_NR, +#ifdef CONFIG_X86_ESPFIX64 + ESPFIX_START_NR, +#endif +#ifdef CONFIG_EFI + EFI_END_NR, +#endif + HIGH_KERNEL_NR, + MODULES_VADDR_NR, + MODULES_END_NR, + FIXADDR_START_NR, + END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, + [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, +#ifdef CONFIG_KASAN + /* + * These fields get initialized with the (dynamic) + * KASAN_SHADOW_{START,END} values in pt_dump_init(). 
+ */ + [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { 0UL, "LDT remap" }, +#endif + [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, +#ifdef CONFIG_X86_ESPFIX64 + [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, +#endif +#ifdef CONFIG_EFI + [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, +#endif + [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, + [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, + [MODULES_END_NR] = { MODULES_END, "End Modules" }, + [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, + [END_OF_SPACE_NR] = { -1, NULL } +}; + +#define INIT_PGD ((pgd_t *) &init_top_pgt) + +#else /* CONFIG_X86_64 */ + +enum address_markers_idx { + USER_SPACE_NR = 0, + KERNEL_SPACE_NR, + VMALLOC_START_NR, + VMALLOC_END_NR, +#ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + LDT_NR, +#endif + CPU_ENTRY_AREA_NR, + FIXADDR_START_NR, + END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, +#ifdef CONFIG_HIGHMEM + [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { 0UL, "LDT remap" }, +#endif + [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, + [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, + [END_OF_SPACE_NR] = { -1, NULL } +}; + +#define INIT_PGD (swapper_pg_dir) + +#endif /* !CONFIG_X86_64 */ + +/* Multipliers for offsets within the PTEs */ +#define PTE_LEVEL_MULT (PAGE_SIZE) +#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) +#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT) + +#define 
pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ +({ \ + if (to_dmesg) \ + printk(KERN_INFO fmt, ##args); \ + else \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \ +({ \ + if (to_dmesg) \ + printk(KERN_CONT fmt, ##args); \ + else \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +/* + * Print a readable form of a pgprot_t to the seq_file + */ +static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg) +{ + static const char * const level_name[] = + { "pgd", "p4d", "pud", "pmd", "pte" }; + + if (!(pr & _PAGE_PRESENT)) { + /* Not present */ + pt_dump_cont_printf(m, dmsg, " "); + } else { + if (pr & _PAGE_USER) + pt_dump_cont_printf(m, dmsg, "USR "); + else + pt_dump_cont_printf(m, dmsg, " "); + if (pr & _PAGE_RW) + pt_dump_cont_printf(m, dmsg, "RW "); + else + pt_dump_cont_printf(m, dmsg, "ro "); + if (pr & _PAGE_PWT) + pt_dump_cont_printf(m, dmsg, "PWT "); + else + pt_dump_cont_printf(m, dmsg, " "); + if (pr & _PAGE_PCD) + pt_dump_cont_printf(m, dmsg, "PCD "); + else + pt_dump_cont_printf(m, dmsg, " "); + + /* Bit 7 has a different meaning on level 3 vs 4 */ + if (level <= 3 && pr & _PAGE_PSE) + pt_dump_cont_printf(m, dmsg, "PSE "); + else + pt_dump_cont_printf(m, dmsg, " "); + if ((level == 4 && pr & _PAGE_PAT) || + ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + pt_dump_cont_printf(m, dmsg, "PAT "); + else + pt_dump_cont_printf(m, dmsg, " "); + if (pr & _PAGE_GLOBAL) + pt_dump_cont_printf(m, dmsg, "GLB "); + else + pt_dump_cont_printf(m, dmsg, " "); + if (pr & _PAGE_NX) + pt_dump_cont_printf(m, dmsg, "NX "); + else + pt_dump_cont_printf(m, dmsg, "x "); + } + pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); +} + +static void note_wx(struct pg_state *st, unsigned long addr) +{ + unsigned long npages; + + npages = (addr - st->start_address) / PAGE_SIZE; + +#ifdef CONFIG_PCI_BIOS + /* + * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. 
+ * Inform about it, but avoid the warning. + */ + if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && + addr <= PAGE_OFFSET + BIOS_END) { + pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); + return; + } +#endif + /* Account the WX pages */ + st->wx_pages += npages; + WARN_ONCE(__supported_pte_mask & _PAGE_NX, + "x86/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); +} + +static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) +{ + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t prot = val & PTE_FLAGS_MASK; + pgprotval_t effective; + + if (level > 0) { + pgprotval_t higher_prot = st->prot_levels[level - 1]; + + effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | + ((higher_prot | prot) & _PAGE_NX); + } else { + effective = prot; + } + + st->prot_levels[level] = effective; +} + +/* + * This function gets called on a break in a continuous series + * of PTE entries; the next one is different so we need to + * print what we collected so far. + */ +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, + u64 val) +{ + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t new_prot, new_eff; + pgprotval_t cur, eff; + static const char units[] = "BKMGTPE"; + struct seq_file *m = st->seq; + + new_prot = val & PTE_FLAGS_MASK; + if (!val) + new_eff = 0; + else + new_eff = st->prot_levels[level]; + + /* + * If we have a "break" in the series, we need to flush the state that + * we have now. "break" is either changing perms, levels or + * address space marker. 
+ */ + cur = st->current_prot; + eff = st->effective_prot; + + if (st->level == -1) { + /* First entry */ + st->current_prot = new_prot; + st->effective_prot = new_eff; + st->level = level; + st->marker = address_markers; + st->lines = 0; + pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", + st->marker->name); + } else if (new_prot != cur || new_eff != eff || level != st->level || + addr >= st->marker[1].start_address) { + const char *unit = units; + unsigned long delta; + int width = sizeof(unsigned long) * 2; + + if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) + note_wx(st, addr); + + /* + * Now print the actual finished series + */ + if (!st->marker->max_lines || + st->lines < st->marker->max_lines) { + pt_dump_seq_printf(m, st->to_dmesg, + "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, addr); + + delta = addr - st->start_address; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", + delta, *unit); + printk_prot(m, st->current_prot, st->level, + st->to_dmesg); + } + st->lines++; + + /* + * We print markers for special areas of address space, + * such as the start of vmalloc space etc. + * This helps in the interpretation. + */ + if (addr >= st->marker[1].start_address) { + if (st->marker->max_lines && + st->lines > st->marker->max_lines) { + unsigned long nskip = + st->lines - st->marker->max_lines; + pt_dump_seq_printf(m, st->to_dmesg, + "... %lu entr%s skipped ... \n", + nskip, + nskip == 1 ? 
"y" : "ies"); + } + st->marker++; + st->lines = 0; + pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", + st->marker->name); + } + + st->start_address = addr; + st->current_prot = new_prot; + st->effective_prot = new_eff; + st->level = level; + } +} + +static void ptdump_walk_pgd_level_core(struct seq_file *m, + struct mm_struct *mm, pgd_t *pgd, + bool checkwx, bool dmesg) +{ + const struct ptdump_range ptdump_ranges[] = { +#ifdef CONFIG_X86_64 + {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2}, + {GUARD_HOLE_END_ADDR, ~0UL}, +#else + {0, ~0UL}, +#endif + {0, 0} +}; + + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .effective_prot = effective_prot, + .range = ptdump_ranges + }, + .level = -1, + .to_dmesg = dmesg, + .check_wx = checkwx, + .seq = m + }; + + ptdump_walk_pgd(&st.ptdump, mm, pgd); + + if (!checkwx) + return; + if (st.wx_pages) + pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", + st.wx_pages); + else + pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); +} + +void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm) +{ + ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true); +} + +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, + bool user) +{ + pgd_t *pgd = mm->pgd; +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (user && boot_cpu_has(X86_FEATURE_PTI)) + pgd = kernel_to_user_pgdp(pgd); +#endif + ptdump_walk_pgd_level_core(m, mm, pgd, false, false); +} +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); + +void ptdump_walk_user_pgd_level_checkwx(void) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pgd_t *pgd = INIT_PGD; + + if (!(__supported_pte_mask & _PAGE_NX) || + !boot_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("x86/mm: Checking user space page tables\n"); + pgd = kernel_to_user_pgdp(pgd); + ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false); +#endif +} + +void ptdump_walk_pgd_level_checkwx(void) +{ + ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, 
true, false); +} + +static int __init pt_dump_init(void) +{ + /* + * Various markers are not compile-time constants, so assign them + * here. + */ +#ifdef CONFIG_X86_64 + address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; +#ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +#endif +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif +#endif +#ifdef CONFIG_X86_32 + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; +# ifdef CONFIG_HIGHMEM + address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; +# endif + address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; + address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; +# ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +# endif +#endif + return 0; +} +__initcall(pt_dump_init); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c new file mode 100644 index 0000000000..271dcb2dea --- /dev/null +++ b/arch/x86/mm/extable.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/extable.h> +#include <linux/uaccess.h> +#include <linux/sched/debug.h> +#include <linux/bitfield.h> +#include <xen/xen.h> + +#include <asm/fpu/api.h> +#include <asm/sev.h> +#include <asm/traps.h> +#include <asm/kdebug.h> +#include <asm/insn-eval.h> +#include <asm/sgx.h> + +static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr) +{ + int reg_offset = pt_regs_offset(regs, nr); + static unsigned long __dummy; + + if (WARN_ON_ONCE(reg_offset < 0)) + return &__dummy; + + return (unsigned long *)((unsigned long)regs + reg_offset); +} + +static inline unsigned long 
+ex_fixup_addr(const struct exception_table_entry *x) +{ + return (unsigned long)&x->fixup + x->fixup; +} + +static bool ex_handler_default(const struct exception_table_entry *e, + struct pt_regs *regs) +{ + if (e->data & EX_FLAG_CLEAR_AX) + regs->ax = 0; + if (e->data & EX_FLAG_CLEAR_DX) + regs->dx = 0; + + regs->ip = ex_fixup_addr(e); + return true; +} + +/* + * This is the *very* rare case where we do a "load_unaligned_zeropad()" + * and it's a page crosser into a non-existent page. + * + * This happens when we optimistically load a pathname a word-at-a-time + * and the name is less than the full word and the next page is not + * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC. + * + * NOTE! The faulting address is always a 'mov mem,reg' type instruction + * of size 'long', and the exception fixup must always point to right + * after the instruction. + */ +static bool ex_handler_zeropad(const struct exception_table_entry *e, + struct pt_regs *regs, + unsigned long fault_addr) +{ + struct insn insn; + const unsigned long mask = sizeof(long) - 1; + unsigned long offset, addr, next_ip, len; + unsigned long *reg; + + next_ip = ex_fixup_addr(e); + len = next_ip - regs->ip; + if (len > MAX_INSN_SIZE) + return false; + + if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN)) + return false; + if (insn.length != len) + return false; + + if (insn.opcode.bytes[0] != 0x8b) + return false; + if (insn.opnd_bytes != sizeof(long)) + return false; + + addr = (unsigned long) insn_get_addr_ref(&insn, regs); + if (addr == ~0ul) + return false; + + offset = addr & mask; + addr = addr & ~mask; + if (fault_addr != addr + sizeof(long)) + return false; + + reg = insn_get_modrm_reg_ptr(&insn, regs); + if (!reg) + return false; + + *reg = *(unsigned long *)addr >> (offset * 8); + return ex_handler_default(e, regs); +} + +static bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ax = trapnr; + return 
ex_handler_default(fixup, regs); +} + +static bool ex_handler_sgx(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG; + return ex_handler_default(fixup, regs); +} + +/* + * Handler for when we fail to restore a task's FPU state. We should never get + * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * should always be valid. However, past bugs have allowed userspace to set + * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). + * These caused XRSTOR to fail when switching to the task, leaking the FPU + * registers of the task previously executing on the CPU. Mitigate this class + * of vulnerability by restoring from the initial state (essentially, zeroing + * out all the FPU registers) if we can't restore from the task's FPU state. + */ +static bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs) +{ + regs->ip = ex_fixup_addr(fixup); + + WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", + (void *)instruction_pointer(regs)); + + fpu_reset_from_exception_fixup(); + return true; +} + +/* + * On x86-64, we end up being imprecise with 'access_ok()', and allow + * non-canonical user addresses to make the range comparisons simpler, + * and to not have to worry about LAM being enabled. + * + * In fact, we allow up to one page of "slop" at the sign boundary, + * which means that we can do access_ok() by just checking the sign + * of the pointer for the common case of having a small access size. + */ +static bool gp_fault_address_ok(unsigned long fault_address) +{ +#ifdef CONFIG_X86_64 + /* Is it in the "user space" part of the non-canonical space? */ + if (valid_user_address(fault_address)) + return true; + + /* .. or just above it? 
*/ + fault_address -= PAGE_SIZE; + if (valid_user_address(fault_address)) + return true; +#endif + return false; +} + +static bool ex_handler_uaccess(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long fault_address) +{ + WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address), + "General protection fault in user access. Non-canonical address?"); + return ex_handler_default(fixup, regs); +} + +static bool ex_handler_copy(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); + return ex_handler_fault(fixup, regs, trapnr); +} + +static bool ex_handler_msr(const struct exception_table_entry *fixup, + struct pt_regs *regs, bool wrmsr, bool safe, int reg) +{ + if (__ONCE_LITE_IF(!safe && wrmsr)) { + pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, + (unsigned int)regs->ax, regs->ip, (void *)regs->ip); + show_stack_regs(regs); + } + + if (__ONCE_LITE_IF(!safe && !wrmsr)) { + pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + show_stack_regs(regs); + } + + if (!wrmsr) { + /* Pretend that the read succeeded and returned 0. 
*/ + regs->ax = 0; + regs->dx = 0; + } + + if (safe) + *pt_regs_nr(regs, reg) = -EIO; + + return ex_handler_default(fixup, regs); +} + +static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs) +{ + if (static_cpu_has(X86_BUG_NULL_SEG)) + asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); + asm volatile ("mov %0, %%fs" : : "rm" (0)); + return ex_handler_default(fixup, regs); +} + +static bool ex_handler_imm_reg(const struct exception_table_entry *fixup, + struct pt_regs *regs, int reg, int imm) +{ + *pt_regs_nr(regs, reg) = (long)imm; + return ex_handler_default(fixup, regs); +} + +static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, + unsigned long fault_address, + int reg, int imm) +{ + regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg); + return ex_handler_uaccess(fixup, regs, trapnr, fault_address); +} + +int ex_get_fixup_type(unsigned long ip) +{ + const struct exception_table_entry *e = search_exception_tables(ip); + + return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE; +} + +int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, + unsigned long fault_addr) +{ + const struct exception_table_entry *e; + int type, reg, imm; + +#ifdef CONFIG_PNPBIOS + if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { + extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; + extern u32 pnp_bios_is_utter_crap; + pnp_bios_is_utter_crap = 1; + printk(KERN_CRIT "PNPBIOS fault.. 
attempting recovery.\n"); + __asm__ volatile( + "movl %0, %%esp\n\t" + "jmp *%1\n\t" + : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); + panic("do_trap: can't hit this"); + } +#endif + + e = search_exception_tables(regs->ip); + if (!e) + return 0; + + type = FIELD_GET(EX_DATA_TYPE_MASK, e->data); + reg = FIELD_GET(EX_DATA_REG_MASK, e->data); + imm = FIELD_GET(EX_DATA_IMM_MASK, e->data); + + switch (type) { + case EX_TYPE_DEFAULT: + case EX_TYPE_DEFAULT_MCE_SAFE: + return ex_handler_default(e, regs); + case EX_TYPE_FAULT: + case EX_TYPE_FAULT_MCE_SAFE: + return ex_handler_fault(e, regs, trapnr); + case EX_TYPE_UACCESS: + return ex_handler_uaccess(e, regs, trapnr, fault_addr); + case EX_TYPE_COPY: + return ex_handler_copy(e, regs, trapnr); + case EX_TYPE_CLEAR_FS: + return ex_handler_clear_fs(e, regs); + case EX_TYPE_FPU_RESTORE: + return ex_handler_fprestore(e, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(e, regs); + case EX_TYPE_WRMSR: + return ex_handler_msr(e, regs, true, false, reg); + case EX_TYPE_RDMSR: + return ex_handler_msr(e, regs, false, false, reg); + case EX_TYPE_WRMSR_SAFE: + return ex_handler_msr(e, regs, true, true, reg); + case EX_TYPE_RDMSR_SAFE: + return ex_handler_msr(e, regs, false, true, reg); + case EX_TYPE_WRMSR_IN_MCE: + ex_handler_msr_mce(regs, true); + break; + case EX_TYPE_RDMSR_IN_MCE: + ex_handler_msr_mce(regs, false); + break; + case EX_TYPE_POP_REG: + regs->sp += sizeof(long); + fallthrough; + case EX_TYPE_IMM_REG: + return ex_handler_imm_reg(e, regs, reg, imm); + case EX_TYPE_FAULT_SGX: + return ex_handler_sgx(e, regs, trapnr); + case EX_TYPE_UCOPY_LEN: + return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(e, regs, fault_addr); + } + BUG(); +} + +extern unsigned int early_recursion_flag; + +/* Restricted version used during very early boot */ +void __init early_fixup_exception(struct pt_regs *regs, int trapnr) +{ + /* Ignore early NMIs. 
*/ + if (trapnr == X86_TRAP_NMI) + return; + + if (early_recursion_flag > 2) + goto halt_loop; + + /* + * Old CPUs leave the high bits of CS on the stack + * undefined. I'm not sure which CPUs do this, but at least + * the 486 DX works this way. + * Xen pv domains are not using the default __KERNEL_CS. + */ + if (!xen_pv_domain() && regs->cs != __KERNEL_CS) + goto fail; + + /* + * The full exception fixup machinery is available as soon as + * the early IDT is loaded. This means that it is the + * responsibility of extable users to either function correctly + * when handlers are invoked early or to simply avoid causing + * exceptions before they're ready to handle them. + * + * This is better than filtering which handlers can be used, + * because refusing to call a handler here is guaranteed to + * result in a hard-to-debug panic. + * + * Keep in mind that not all vectors actually get here. Early + * page faults, for example, are special. + */ + if (fixup_exception(regs, trapnr, regs->orig_ax, 0)) + return; + + if (trapnr == X86_TRAP_UD) { + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + /* Skip the ud2. */ + regs->ip += LEN_UD2; + return; + } + + /* + * If this was a BUG and report_bug returns or if this + * was just a normal #UD, we want to continue onward and + * crash. + */ + } + +fail: + early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", + (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, + regs->orig_ax, read_cr2()); + + show_regs(regs); + +halt_loop: + while (true) + halt(); +} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c new file mode 100644 index 0000000000..ab778eac19 --- /dev/null +++ b/arch/x86/mm/fault.c @@ -0,0 +1,1565 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar + */ +#include <linux/sched.h> /* test_thread_flag(), ... 
*/ +#include <linux/sched/task_stack.h> /* task_stack_*(), ... */ +#include <linux/kdebug.h> /* oops_begin/end, ... */ +#include <linux/extable.h> /* search_exception_tables */ +#include <linux/memblock.h> /* max_low_pfn */ +#include <linux/kfence.h> /* kfence_handle_page_fault */ +#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ +#include <linux/mmiotrace.h> /* kmmio_handler, ... */ +#include <linux/perf_event.h> /* perf_sw_event */ +#include <linux/hugetlb.h> /* hstate_index_to_shift */ +#include <linux/prefetch.h> /* prefetchw */ +#include <linux/context_tracking.h> /* exception_enter(), ... */ +#include <linux/uaccess.h> /* faulthandler_disabled() */ +#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ +#include <linux/mm_types.h> +#include <linux/mm.h> /* find_and_lock_vma() */ + +#include <asm/cpufeature.h> /* boot_cpu_has, ... */ +#include <asm/traps.h> /* dotraplinkage, ... */ +#include <asm/fixmap.h> /* VSYSCALL_ADDR */ +#include <asm/vsyscall.h> /* emulate_vsyscall */ +#include <asm/vm86.h> /* struct vm86 */ +#include <asm/mmu_context.h> /* vma_pkey() */ +#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/ +#include <asm/desc.h> /* store_idt(), ... */ +#include <asm/cpu_entry_area.h> /* exception stack */ +#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ +#include <asm/kvm_para.h> /* kvm_handle_async_pf */ +#include <asm/vdso.h> /* fixup_vdso_exception() */ +#include <asm/irq_stack.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/exceptions.h> + +/* + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace; returns -1 when kmmio_handler() returns 1: + */ +static nokprobe_inline int +kmmio_fault(struct pt_regs *regs, unsigned long addr) +{ + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; + return 0; +} + +/* + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. This is AMD erratum #91. 
+ * + * 64-bit mode: + * + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. + */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In 64-bit mode 0x40..0x4F are valid REX prefixes + */ + return (!user_mode(regs) || user_64bit_mode(regs)); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. 
*/ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (get_kernel_nofault(opcode, instr)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + +static bool is_amd_k8_pre_npt(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) && + c->x86_vendor == X86_VENDOR_AMD && + c->x86 == 0xf && c->x86_model < 0x40); +} + +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) +{ + unsigned char *max_instr; + unsigned char *instr; + int prefetch = 0; + + /* Erratum #91 affects AMD K8, pre-NPT CPUs */ + if (!is_amd_k8_pre_npt()) + return 0; + + /* + * If it was an exec (instruction fetch) fault on an NX page, then + * do not ignore the fault: + */ + if (error_code & X86_PF_INSTR) + return 0; + + instr = (void *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + /* + * This code has historically always bailed out if IP points to a + * not-present page (e.g. due to a race). No one has ever + * complained about this. 
+ */ + pagefault_disable(); + + while (instr < max_instr) { + unsigned char opcode; + + if (user_mode(regs)) { + if (get_user(opcode, (unsigned char __user *) instr)) + break; + } else { + if (get_kernel_nofault(opcode, instr)) + break; + } + + instr++; + + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) + break; + } + + pagefault_enable(); + return prefetch; +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_p4d/set_pud. + */ + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + + if (pmd_present(*pmd) != pmd_present(*pmd_k)) + set_pmd(pmd, *pmd_k); + + if (!pmd_present(*pmd_k)) + return NULL; + else + BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); + + return pmd_k; +} + +/* + * Handle a fault on the vmalloc or module mapping area + * + * This is needed because there is a race condition between the time + * when the vmalloc mapping code updates the PMD to the point in time + * where it synchronizes this update with the other page-tables in the + * system. + * + * In this race window another thread/CPU can map an area on the same + * PMD, finds it already present and does not synchronize it with the + * rest of the system yet. 
As a result v[mz]alloc might return areas + * which are not mapped in every page-table in the system, causing an + * unhandled page-fault when they are accessed. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3_pa(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + if (pmd_large(*pmd_k)) + return 0; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} +NOKPROBE_SYMBOL(vmalloc_fault); + +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + unsigned long addr; + + for (addr = start & PMD_MASK; + addr >= TASK_SIZE_MAX && addr < VMALLOC_END; + addr += PMD_SIZE) { + struct page *page; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + spinlock_t *pgt_lock; + + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + vmalloc_sync_one(page_address(page), addr); + spin_unlock(pgt_lock); + } + spin_unlock(&pgd_lock); + } +} + +static bool low_pfn(unsigned long pfn) +{ + return pfn < max_low_pfn; +} + +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3_pa()); + pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + +#ifdef CONFIG_X86_PAE + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; +#define pr_pde pr_cont +#else +#define pr_pde pr_info +#endif + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = 
pmd_offset(pud, address); + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); +#undef pr_pde + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already: + */ + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + + pte = pte_offset_kernel(pmd, address); + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); +out: + pr_cont("\n"); +} + +#else /* CONFIG_X86_64: */ + +#ifdef CONFIG_CPU_SUP_AMD +static const char errata93_warning[] = +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; +#endif + +static int bad_address(void *p) +{ + unsigned long dummy; + + return get_kernel_nofault(dummy, (unsigned long *)p); +} + +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3_pa()); + pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (bad_address(pgd)) + goto bad; + + pr_info("PGD %lx ", pgd_val(*pgd)); + + if (!pgd_present(*pgd)) + goto out; + + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + pr_cont("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); + if (bad_address(pud)) + goto bad; + + pr_cont("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud) || pud_large(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) + goto bad; + + pr_cont("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) + goto bad; + + pr_cont("PTE %lx", pte_val(*pte)); +out: + pr_cont("\n"); + return; +bad: + 
pr_info("BAD\n"); +} + +#endif /* CONFIG_X86_64 */ + +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. + */ +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ +#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD + || boot_cpu_data.x86 != 0xf) + return 0; + + if (user_mode(regs)) + return 0; + + if (address != regs->ip) + return 0; + + if ((address >> 32) != 0) + return 0; + + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + printk_once(errata93_warning); + regs->ip = address; + return 1; + } +#endif + return 0; +} + +/* + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. 
+ */ +static int is_errata100(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) + return 1; +#endif + return 0; +} + +/* Pentium F0 0F C7 C8 bug workaround: */ +static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ +#ifdef CONFIG_X86_F00F_BUG + if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) && + idt_is_f00f_address(address)) { + handle_invalid_op(regs); + return 1; + } +#endif + return 0; +} + +static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) +{ + u32 offset = (index >> 3) * sizeof(struct desc_struct); + unsigned long addr; + struct ldttss_desc desc; + + if (index == 0) { + pr_alert("%s: NULL\n", name); + return; + } + + if (offset + sizeof(struct ldttss_desc) >= gdt->size) { + pr_alert("%s: 0x%hx -- out of bounds\n", name, index); + return; + } + + if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset), + sizeof(struct ldttss_desc))) { + pr_alert("%s: 0x%hx -- GDT entry is not readable\n", + name, index); + return; + } + + addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24); +#ifdef CONFIG_X86_64 + addr |= ((u64)desc.base3 << 32); +#endif + pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n", + name, index, addr, (desc.limit0 | (desc.limit1 << 16))); +} + +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) +{ + if (!oops_may_print()) + return; + + if (error_code & X86_PF_INSTR) { + unsigned int level; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd += pgd_index(address); + + pte = lookup_address_in_pgd(pgd, address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) + pr_crit("kernel tried to execute NX-protected page - exploit attempt? 
(uid: %d)\n", + from_kuid(&init_user_ns, current_uid())); + if (pte && pte_present(*pte) && pte_exec(*pte) && + (pgd_flags(*pgd) & _PAGE_USER) && + (__read_cr4() & X86_CR4_SMEP)) + pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", + from_kuid(&init_user_ns, current_uid())); + } + + if (address < PAGE_SIZE && !user_mode(regs)) + pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", + (void *)address); + else + pr_alert("BUG: unable to handle page fault for address: %px\n", + (void *)address); + + pr_alert("#PF: %s %s in %s mode\n", + (error_code & X86_PF_USER) ? "user" : "supervisor", + (error_code & X86_PF_INSTR) ? "instruction fetch" : + (error_code & X86_PF_WRITE) ? "write access" : + "read access", + user_mode(regs) ? "user" : "kernel"); + pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, + !(error_code & X86_PF_PROT) ? "not-present page" : + (error_code & X86_PF_RSVD) ? "reserved bit violation" : + (error_code & X86_PF_PK) ? "protection keys violation" : + "permissions violation"); + + if (!(error_code & X86_PF_USER) && user_mode(regs)) { + struct desc_ptr idt, gdt; + u16 ldtr, tr; + + /* + * This can happen for quite a few reasons. The more obvious + * ones are faults accessing the GDT, or LDT. Perhaps + * surprisingly, if the CPU tries to deliver a benign or + * contributory exception from user code and gets a page fault + * during delivery, the page fault can be delivered as though + * it originated directly from user code. This could happen + * due to wrong permissions on the IDT, GDT, LDT, TSS, or + * kernel or IST stack. + */ + store_idt(&idt); + + /* Usable even on Xen PV -- it's just slow. 
*/ + native_store_gdt(&gdt); + + pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n", + idt.address, idt.size, gdt.address, gdt.size); + + store_ldt(ldtr); + show_ldttss(&gdt, "LDTR", ldtr); + + store_tr(tr); + show_ldttss(&gdt, "TR", tr); + } + + dump_pagetable(address); +} + +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + tsk->comm, address); + dump_pagetable(address); + + if (__die("Bad pagetable", regs, error_code)) + sig = 0; + + oops_end(flags, regs, sig); +} + +static void sanitize_error_code(unsigned long address, + unsigned long *error_code) +{ + /* + * To avoid leaking information about the kernel page + * table layout, pretend that user-mode accesses to + * kernel addresses are always protection faults. + * + * NB: This means that failed vsyscalls with vsyscall=none + * will have the PROT bit. This doesn't leak any + * information and does not appear to cause any problems. + */ + if (address >= TASK_SIZE_MAX) + *error_code |= X86_PF_PROT; +} + +static void set_signal_archinfo(unsigned long address, + unsigned long error_code) +{ + struct task_struct *tsk = current; + + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code | X86_PF_USER; + tsk->thread.cr2 = address; +} + +static noinline void +page_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ +#ifdef CONFIG_VMAP_STACK + struct stack_info info; +#endif + unsigned long flags; + int sig; + + if (user_mode(regs)) { + /* + * Implicit kernel access from user mode? Skip the stack + * overflow and EFI special cases. + */ + goto oops; + } + +#ifdef CONFIG_VMAP_STACK + /* + * Stack overflow? 
During boot, we can fault near the initial + * stack in the direct map, but that's not an overflow -- check + * that we're in vmalloc space to avoid this. + */ + if (is_vmalloc_addr((void *)address) && + get_stack_guard_info((void *)address, &info)) { + /* + * We're likely to be running with very little stack space + * left. It's plausible that we'd hit this condition but + * double-fault even before we get this far, in which case + * we're fine: the double-fault handler will deal with it. + * + * We don't want to make it all the way into the oops code + * and then double-fault, though, because we're likely to + * break the console driver and lose most of the stack dump. + */ + call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*), + handle_stack_overflow, + ASM_CALL_ARG3, + , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info)); + + unreachable(); + } +#endif + + /* + * Buggy firmware could access regions which might page fault. If + * this happens, EFI has a special OOPS path that will try to + * avoid hanging the system. + */ + if (IS_ENABLED(CONFIG_EFI)) + efi_crash_gracefully_on_page_fault(address); + + /* Only not-present faults should be handled by KFENCE. */ + if (!(error_code & X86_PF_PROT) && + kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs)) + return; + +oops: + /* + * Oops. The kernel tried to access some bad page. 
We'll have to + * terminate things with extreme prejudice: + */ + flags = oops_begin(); + + show_fault_oops(regs, error_code, address); + + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_DEFAULT "CR2: %016lx\n", address); + + oops_end(flags, regs, sig); +} + +static noinline void +kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code, + u32 pkey) +{ + WARN_ON_ONCE(user_mode(regs)); + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) { + /* + * Any interrupt that takes a fault gets the fixup. This makes + * the below recursive fault logic only apply to faults from + * task context. + */ + if (in_interrupt()) + return; + + /* + * Per the above we're !in_interrupt(), aka. task context. + * + * In this case we need to make sure we're not recursively + * faulting through the emulate_vsyscall() logic. + */ + if (current->thread.sig_on_uaccess_err && signal) { + sanitize_error_code(address, &error_code); + + set_signal_archinfo(address, error_code); + + if (si_code == SEGV_PKUERR) { + force_sig_pkuerr((void __user *)address, pkey); + } else { + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_fault(signal, si_code, (void __user *)address); + } + } + + /* + * Barring that, we can do the fixup and be happy. + */ + return; + } + + /* + * AMD erratum #91 manifests as a spurious page fault on a PREFETCH + * instruction. 
+ */ + if (is_prefetch(regs, error_code, address)) + return; + + page_fault_oops(regs, error_code, address); +} + +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; + /* This is a racy snapshot, but it's better than nothing. */ + int cpu = raw_smp_processor_id(); + + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", + loglvl, tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + /* + * Dump the likely CPU where the fatal segfault happened. + * This can help identify faulty hardware. + */ + printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu, + topology_core_id(cpu), topology_physical_package_id(cpu)); + + + printk(KERN_CONT "\n"); + + show_opcodes(regs, loglvl); +} + +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. + */ +static bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, u32 pkey, int si_code) +{ + struct task_struct *tsk = current; + + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, si_code, pkey); + return; + } + + if (!(error_code & X86_PF_USER)) { + /* Implicit user access to kernel memory -- just oops */ + page_fault_oops(regs, error_code, address); + return; + } + + /* + * User mode accesses just cause a SIGSEGV. 
+ * It's possible to have interrupts off here: + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + sanitize_error_code(address, &error_code); + + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; + + if (likely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + set_signal_archinfo(address, error_code); + + if (si_code == SEGV_PKUERR) + force_sig_pkuerr((void __user *)address, pkey); + else + force_sig_fault(SIGSEGV, si_code, (void __user *)address); + + local_irq_disable(); +} + +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR); +} + +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, u32 pkey, int si_code) +{ + struct mm_struct *mm = current->mm; + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + mmap_read_unlock(mm); + + __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); +} + +static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) +{ + /* This code is always called on the current mm */ + bool foreign = false; + + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return false; + if (error_code & X86_PF_PK) + return true; + /* this checks permission keys on the VMA: */ + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) + return true; + return false; +} + +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct vm_area_struct *vma) +{ + /* + * This OSPKE check is not strictly necessary at runtime. 
+ * But, doing it this way allows compiler optimizations + * if pkeys are compiled out. + */ + if (bad_area_access_from_pkeys(error_code, vma)) { + /* + * A protection key fault means that the PKRU value did not allow + * access to some PTE. Userspace can figure out what PKRU was + * from the XSAVE state. This function captures the pkey from + * the vma and passes it to userspace so userspace can discover + * which protection key was set on the PTE. + * + * If we get here, we know that the hardware signaled a X86_PF_PK + * fault and that there was a VMA once we got in the fault + * handler. It does *not* guarantee that the VMA we find here + * was the one that we faulted on. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set PKRU to deny access to pkey=4, touches page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_lock, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ + u32 pkey = vma_pkey(vma); + + __bad_area(regs, error_code, address, pkey, SEGV_PKUERR); + } else { + __bad_area(regs, error_code, address, 0, SEGV_ACCERR); + } +} + +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + vm_fault_t fault) +{ + /* Kernel mode? 
Handle exceptions or die: */ + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); + return; + } + + /* User-space => ok to do another page fault: */ + if (is_prefetch(regs, error_code, address)) + return; + + sanitize_error_code(address, &error_code); + + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; + + set_signal_archinfo(address, error_code); + +#ifdef CONFIG_MEMORY_FAILURE + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + struct task_struct *tsk = current; + unsigned lsb = 0; + + pr_err( + "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + tsk->comm, tsk->pid, address); + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); + return; + } +#endif + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); +} + +static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) + return 0; + + if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + +/* + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * + * Spurious faults may only occur if the TLB contains an entry with + * fewer permission than the page table entry. Non-present (P = 0) + * and reserved bit (R = 1) faults are never spurious. + * + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + * + * Returns non-zero if a spurious fault was handled, zero otherwise. 
+ * + * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 + * (Optional Invalidation). + */ +static noinline int +spurious_kernel_fault(unsigned long error_code, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret; + + /* + * Only writes to RO or instruction fetches from NX may cause + * spurious faults. + * + * These could be from user or supervisor accesses but the TLB + * is only lazily flushed after a kernel mapping protection + * change, so user accesses are not expected to cause spurious + * faults. + */ + if (error_code != (X86_PF_WRITE | X86_PF_PROT) && + error_code != (X86_PF_INSTR | X86_PF_PROT)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_kernel_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); + if (!pud_present(*pud)) + return 0; + + if (pud_large(*pud)) + return spurious_kernel_fault_check(error_code, (pte_t *) pud); + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return spurious_kernel_fault_check(error_code, (pte_t *) pmd); + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + + ret = spurious_kernel_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: + */ + ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + + return ret; +} +NOKPROBE_SYMBOL(spurious_kernel_fault); + +int show_unhandled_signals = 1; + +static inline int +access_error(unsigned long error_code, struct vm_area_struct *vma) +{ + /* This is only called for the current mm, so: */ + bool foreign = false; + + /* + * Read or write was blocked by protection keys. 
This is + * always an unconditional error and can never result in + * a follow-up action to resolve the fault, like a COW. + */ + if (error_code & X86_PF_PK) + return 1; + + /* + * SGX hardware blocked the access. This usually happens + * when the enclave memory contents have been destroyed, like + * after a suspend/resume cycle. In any case, the kernel can't + * fix the cause of the fault. Handle the fault as an access + * error even in cases where no actual access violation + * occurred. This allows userspace to rebuild the enclave in + * response to the signal. + */ + if (unlikely(error_code & X86_PF_SGX)) + return 1; + + /* + * Make sure to check the VMA so that we do not perform + * faults just to hit a X86_PF_PK as soon as we fill in a + * page. + */ + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) + return 1; + + /* + * Shadow stack accesses (PF_SHSTK=1) are only permitted to + * shadow stack VMAs. All other accesses result in an error. + */ + if (error_code & X86_PF_SHSTK) { + if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) + return 1; + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } + + if (error_code & X86_PF_WRITE) { + /* write, present and write, not present: */ + if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) + return 1; + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } + + /* read, present: */ + if (unlikely(error_code & X86_PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!vma_is_accessible(vma))) + return 1; + + return 0; +} + +bool fault_in_kernel_space(unsigned long address) +{ + /* + * On 64-bit systems, the vsyscall page is at an address above + * TASK_SIZE_MAX, but is not considered part of the kernel + * address space. 
+ */ + if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) + return false; + + return address >= TASK_SIZE_MAX; +} + +/* + * Called for all faults where 'address' is part of the kernel address + * space. Might get called for faults that originate from *code* that + * ran in userspace or the kernel. + */ +static void +do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, + unsigned long address) +{ + /* + * Protection keys exceptions only happen on user pages. We + * have no user pages in the kernel portion of the address + * space, so do not expect them here. + */ + WARN_ON_ONCE(hw_error_code & X86_PF_PK); + +#ifdef CONFIG_X86_32 + /* + * We can fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * Before doing this on-demand faulting, ensure that the + * fault is not any of the following: + * 1. A fault on a PTE with a reserved bit set. + * 2. A fault caused by a user-mode access. (Do not demand- + * fault kernel memory due to user-mode accesses). + * 3. A fault caused by a page-level protection violation. + * (A demand fault would be on a non-present page which + * would have X86_PF_PROT==0). + * + * This is only needed to close a race condition on x86-32 in + * the vmalloc mapping/unmapping code. See the comment above + * vmalloc_fault() for details. On x86-64 the race does not + * exist as the vmalloc mappings don't need to be synchronized + * there. + */ + if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + } +#endif + + if (is_f00f_bug(regs, hw_error_code, address)) + return; + + /* Was the fault spurious, caused by lazy TLB invalidation? 
*/ + if (spurious_kernel_fault(hw_error_code, address)) + return; + + /* kprobes don't want to hook the spurious faults: */ + if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) + return; + + /* + * Note, despite being a "bad area", there are quite a few + * acceptable reasons to get here, such as erratum fixups + * and handling kernel code that can fault, like get_user(). + * + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock: + */ + bad_area_nosemaphore(regs, hw_error_code, address); +} +NOKPROBE_SYMBOL(do_kern_addr_fault); + +/* + * Handle faults in the user portion of the address space. Nothing in here + * should check X86_PF_USER without a specific justification: for almost + * all purposes, we should treat a normal kernel access to user memory + * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction. + * The one exception is AC flag handling, which is, per the x86 + * architecture, special for WRUSS. + */ +static inline +void do_user_addr_fault(struct pt_regs *regs, + unsigned long error_code, + unsigned long address) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct mm_struct *mm; + vm_fault_t fault; + unsigned int flags = FAULT_FLAG_DEFAULT; + + tsk = current; + mm = tsk->mm; + + if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) { + /* + * Whoops, this is kernel mode code trying to execute from + * user memory. Unless this is AMD erratum #93, which + * corrupts RIP such that it looks like a user address, + * this is unrecoverable. Don't even try to look up the + * VMA or look for extable entries. + */ + if (is_errata93(regs, address)) + return; + + page_fault_oops(regs, error_code, address); + return; + } + + /* kprobes don't want to hook the spurious faults: */ + if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF))) + return; + + /* + * Reserved bits are never expected to be set on + * entries in the user portion of the page tables. 
+ */ + if (unlikely(error_code & X86_PF_RSVD)) + pgtable_bad(regs, error_code, address); + + /* + * If SMAP is on, check for invalid kernel (supervisor) access to user + * pages in the user address space. The odd case here is WRUSS, + * which, according to the preliminary documentation, does not respect + * SMAP and will have the USER bit set so, in all cases, SMAP + * enforcement appears to be consistent with the USER bit. + */ + if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) && + !(error_code & X86_PF_USER) && + !(regs->flags & X86_EFLAGS_AC))) { + /* + * No extable entry here. This was a kernel access to an + * invalid pointer. get_kernel_nofault() will not get here. + */ + page_fault_oops(regs, error_code, address); + return; + } + + /* + * If we're in an interrupt, have no user context or are running + * in a region with pagefaults disabled then we must not take the fault + */ + if (unlikely(faulthandler_disabled() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + + /* + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet: + */ + if (user_mode(regs)) { + local_irq_enable(); + flags |= FAULT_FLAG_USER; + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + /* + * Read-only permissions can not be expressed in shadow stack PTEs. + * Treat all shadow stack accesses as WRITE faults. This ensures + * that the MM will prepare everything (e.g., break COW) such that + * maybe_mkwrite() can create a proper shadow stack PTE. + */ + if (error_code & X86_PF_SHSTK) + flags |= FAULT_FLAG_WRITE; + if (error_code & X86_PF_WRITE) + flags |= FAULT_FLAG_WRITE; + if (error_code & X86_PF_INSTR) + flags |= FAULT_FLAG_INSTRUCTION; + +#ifdef CONFIG_X86_64 + /* + * Faults in the vsyscall page might need emulation. 
The + * vsyscall page is at a high address (>PAGE_OFFSET), but is + * considered to be part of the user address space. + * + * The vsyscall page does not have a "real" VMA, so do this + * emulation before we go searching for VMAs. + * + * PKRU never rejects instruction fetches, so we don't need + * to consider the PF_PK bit. + */ + if (is_vsyscall_vaddr(address)) { + if (emulate_vsyscall(error_code, regs, address)) + return; + } +#endif + + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + + if (unlikely(access_error(error_code, vma))) { + vma_end_read(vma); + goto lock_mmap; + } + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); + return; + } +lock_mmap: + +retry: + vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); + return; + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. 
+ * + * Note that handle_userfault() may also release and reacquire mmap_lock + * (and not return with VM_FAULT_RETRY), when returning to userland to + * repeat the page fault later with a VM_FAULT_NOPAGE retval + * (potentially after handling any pending signal during the return to + * userland). The return to userland is identified whenever + * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. + */ + fault = handle_mm_fault(vma, address, flags, regs); + + if (fault_signal_pending(fault, regs)) { + /* + * Quick path to respond to signals. The core mm code + * has unlocked the mm for us if we get here. + */ + if (!user_mode(regs)) + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); + return; + } + + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + + /* + * If we need to retry the mmap_lock has already been released, + * and if there is a fatal signal pending there is no guarantee + * that we made any progress. Handle this case first. + */ + if (unlikely(fault & VM_FAULT_RETRY)) { + flags |= FAULT_FLAG_TRIED; + goto retry; + } + + mmap_read_unlock(mm); +done: + if (likely(!(fault & VM_FAULT_ERROR))) + return; + + if (fatal_signal_pending(current) && !user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, + 0, 0, ARCH_DEFAULT_PKEY); + return; + } + + if (fault & VM_FAULT_OOM) { + /* Kernel mode? 
Handle exceptions or die: */ + if (!user_mode(regs)) { + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, SEGV_MAPERR, + ARCH_DEFAULT_PKEY); + return; + } + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + do_sigbus(regs, error_code, address, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address); + else + BUG(); + } +} +NOKPROBE_SYMBOL(do_user_addr_fault); + +static __always_inline void +trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + if (!trace_pagefault_enabled()) + return; + + if (user_mode(regs)) + trace_page_fault_user(address, regs, error_code); + else + trace_page_fault_kernel(address, regs, error_code); +} + +static __always_inline void +handle_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + trace_page_fault_entries(regs, error_code, address); + + if (unlikely(kmmio_fault(regs, address))) + return; + + /* Was the fault on kernel-controlled part of the address space? */ + if (unlikely(fault_in_kernel_space(address))) { + do_kern_addr_fault(regs, error_code, address); + } else { + do_user_addr_fault(regs, error_code, address); + /* + * User address page fault handling might have reenabled + * interrupts. Fixing up all potential exit points of + * do_user_addr_fault() and its leaf functions is just not + * doable w/o creating an unholy mess or turning the code + * upside down. + */ + local_irq_disable(); + } +} + +DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) +{ + unsigned long address = read_cr2(); + irqentry_state_t state; + + prefetchw(¤t->mm->mmap_lock); + + /* + * KVM uses #PF vector to deliver 'page not present' events to guests + * (asynchronous page fault mechanism). 
The event happens when a + * userspace task is trying to access some valid (from guest's point of + * view) memory which is not currently mapped by the host (e.g. the + * memory is swapped out). Note, the corresponding "page ready" event + * which is injected when the memory becomes available, is delivered via + * an interrupt mechanism and not a #PF exception + * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). + * + * We are relying on the interrupted context being sane (valid RSP, + * relevant locks not held, etc.), which is fine as long as the + * interrupted context had IF=1. We are also relying on the KVM + * async pf type field and CR2 being read consistently instead of + * getting values from real and async page faults mixed up. + * + * Fingers crossed. + * + * The async #PF handling code takes care of idtentry handling + * itself. + */ + if (kvm_handle_async_pf(regs, (u32)address)) + return; + + /* + * Entry handling for valid #PF from kernel mode is slightly + * different: RCU is already watching and ct_irq_enter() must not + * be invoked because a kernel fault on a user space address might + * sleep. + * + * In case the fault hit a RCU idle region the conditional entry + * code reenabled RCU to avoid subsequent wreckage which helps + * debuggability. 
+ */ + state = irqentry_enter(regs); + + instrumentation_begin(); + handle_page_fault(regs, error_code, address); + instrumentation_end(); + + irqentry_exit(regs, state); +} diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c new file mode 100644 index 0000000000..d9efa35711 --- /dev/null +++ b/arch/x86/mm/highmem_32.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/highmem.h> +#include <linux/export.h> +#include <linux/swap.h> /* for totalram_pages */ +#include <linux/memblock.h> +#include <asm/numa.h> + +void __init set_highmem_pages_init(void) +{ + struct zone *zone; + int nid; + + /* + * Explicitly reset zone->managed_pages because set_highmem_pages_init() + * is invoked before memblock_free_all() + */ + reset_all_zones_managed_pages(); + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + } +} diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c new file mode 100644 index 0000000000..5804bbae4f --- /dev/null +++ b/arch/x86/mm/hugetlbpage.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IA-32 Huge TLB Page Support for Kernel. 
+ * + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/err.h> +#include <linux/sysctl.h> +#include <linux/compat.h> +#include <asm/mman.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/elf.h> + +/* + * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. + */ +int pmd_huge(pmd_t pmd) +{ + return !pmd_none(pmd) && + (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; +} + +/* + * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. + */ +int pud_huge(pud_t pud) +{ +#if CONFIG_PGTABLE_LEVELS > 2 + return !pud_none(pud) && + (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; +#else + return 0; +#endif +} + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = get_mmap_base(1); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + info.high_limit = in_32bit_syscall() ? 
+ task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); + + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} + +static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE_LOW; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + + if (len > TASK_SIZE) + return -ENOMEM; + + /* No address checking. 
See comment at mmap_address_hint_valid() */ + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr &= huge_page_mask(h); + if (!mmap_address_hint_valid(addr, len)) + goto get_unmapped_area; + + vma = find_vma(mm, addr); + if (!vma || addr + len <= vm_start_gap(vma)) + return addr; + } + +get_unmapped_area: + if (mm->get_unmapped_area == arch_get_unmapped_area) + return hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + return hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); +} +#endif /* CONFIG_HUGETLB_PAGE */ + +#ifdef CONFIG_X86_64 +bool __init arch_hugetlb_valid_size(unsigned long size) +{ + if (size == PMD_SIZE) + return true; + else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) + return true; + else + return false; +} + +#ifdef CONFIG_CONTIG_ALLOC +static __init int gigantic_pages_init(void) +{ + /* With compaction or CMA we can allocate gigantic pages at runtime */ + if (boot_cpu_has(X86_FEATURE_GBPAGES)) + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + return 0; +} +arch_initcall(gigantic_pages_init); +#endif +#endif diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c new file mode 100644 index 0000000000..968d7005f4 --- /dev/null +++ b/arch/x86/mm/ident_map.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper routines for building identity mapping page tables. This is + * included by both the compressed kernel and the regular kernel. 
+ */ + +static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (pmd_present(*pmd)) + continue; + + set_pmd(pmd, __pmd((addr - info->offset) | info->page_flag)); + } +} + +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (info->direct_gbpages) { + pud_t pudval; + + if (pud_present(*pud)) + continue; + + addr &= PUD_MASK; + pudval = __pud((addr - info->offset) | info->page_flag); + set_pud(pud, pudval); + continue; + } + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); + } + + return 0; +} + +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long 
pstart, unsigned long pend) +{ + unsigned long addr = pstart + info->offset; + unsigned long end = pend + info->offset; + unsigned long next; + int result; + + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + + /* Filter out unsupported __PAGE_KERNEL_* bits: */ + info->kernpg_flag &= __default_kernel_pte_mask; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr); + p4d_t *p4d; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); + if (result) + return result; + continue; + } + + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) + return -ENOMEM; + result = ident_p4d_init(info, p4d, addr, next); + if (result) + return result; + if (pgtable_l5_enabled()) { + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * The pgd entry has to point to the pud page table in this case. 
+ */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); + } + } + + return 0; +} diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c new file mode 100644 index 0000000000..679893ea5e --- /dev/null +++ b/arch/x86/mm/init.c @@ -0,0 +1,1101 @@ +#include <linux/gfp.h> +#include <linux/initrd.h> +#include <linux/ioport.h> +#include <linux/swap.h> +#include <linux/memblock.h> +#include <linux/swapfile.h> +#include <linux/swapops.h> +#include <linux/kmemleak.h> +#include <linux/sched/task.h> + +#include <asm/set_memory.h> +#include <asm/cpu_device_id.h> +#include <asm/e820/api.h> +#include <asm/init.h> +#include <asm/page.h> +#include <asm/page_types.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/proto.h> +#include <asm/dma.h> /* for MAX_DMA_PFN */ +#include <asm/kaslr.h> +#include <asm/hypervisor.h> +#include <asm/cpufeature.h> +#include <asm/pti.h> +#include <asm/text-patching.h> +#include <asm/memtype.h> +#include <asm/paravirt.h> + +/* + * We need to define the tracepoints somewhere, and tlb.c + * is only compiled when SMP=y. + */ +#include <trace/events/tlb.h> + +#include "mm_internal.h" + +/* + * Tables translating between page_cache_type_t and pte encoding. + * + * The default values are defined statically as minimal supported mode; + * WC and WT fall back to UC-. pat_init() updates these values to support + * more cache modes, WC and WT, when it is safe to do so. See pat_init() + * for the details. Note, __early_ioremap() used during early boot-time + * takes pgprot_t (pte encoding) and does not use these tables. + * + * Index into __cachemode2pte_tbl[] is the cachemode. + * + * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 
+ */ +static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { + [_PAGE_CACHE_MODE_WB ] = 0 | 0 , + [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, + [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, +}; + +unsigned long cachemode2protval(enum page_cache_mode pcm) +{ + if (likely(pcm == 0)) + return 0; + return __cachemode2pte_tbl[pcm]; +} +EXPORT_SYMBOL(cachemode2protval); + +static uint8_t __pte2cachemode_tbl[8] = { + [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, + [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, +}; + +/* + * Check that the write-protect PAT entry is set for write-protect. + * To do this without making assumptions how PAT has been set up (Xen has + * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache + * mode via the __cachemode2pte_tbl[] into protection bits (those protection + * bits will select a cache mode of WP or better), and then translate the + * protection bits back into the cache mode using __pte2cm_idx() and the + * __pte2cachemode_tbl[] array. This will return the really used cache mode. 
+ */ +bool x86_has_pat_wp(void) +{ + uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP]; + + return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP; +} + +enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) +{ + unsigned long masked; + + masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; + if (likely(masked == 0)) + return 0; + return __pte2cachemode_tbl[__pte2cm_idx(masked)]; +} + +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; + +static unsigned long min_pfn_mapped; + +static bool __initdata can_use_brk_pgt = true; + +/* + * Pages returned are already directly mapped. + * + * Changing that is likely to break Xen, see commit: + * + * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information. + */ +__ref void *alloc_low_pages(unsigned int num) +{ + unsigned long pfn; + int i; + + if (after_bootmem) { + unsigned int order; + + order = get_order((unsigned long)num << PAGE_SHIFT); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); + } + + if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { + unsigned long ret = 0; + + if (min_pfn_mapped < max_pfn_mapped) { + ret = memblock_phys_alloc_range( + PAGE_SIZE * num, PAGE_SIZE, + min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT); + } + if (!ret && can_use_brk_pgt) + ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); + + if (!ret) + panic("alloc_low_pages: can not alloc memory"); + + pfn = ret >> PAGE_SHIFT; + } else { + pfn = pgt_buf_end; + pgt_buf_end += num; + } + + for (i = 0; i < num; i++) { + void *adr; + + adr = __va((pfn + i) << PAGE_SHIFT); + clear_page(adr); + } + + return __va(pfn << PAGE_SHIFT); +} + +/* + * By default need to be able to allocate page tables below PGD firstly for + * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping. 
+ * With KASLR memory randomization, depending on the machine e820 memory and the + * PUD alignment, twice that many pages may be needed when KASLR memory + * randomization is enabled. + */ + +#ifndef CONFIG_X86_5LEVEL +#define INIT_PGD_PAGE_TABLES 3 +#else +#define INIT_PGD_PAGE_TABLES 4 +#endif + +#ifndef CONFIG_RANDOMIZE_MEMORY +#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES) +#else +#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES) +#endif + +#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; + + base = __pa(extend_brk(tables, PAGE_SIZE)); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} + +int after_bootmem; + +early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +static int page_size_mask; + +/* + * Save some of cr4 feature set we're using (e.g. Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot + * up after us can get the correct flags. Invoked on the boot CPU. + */ +static inline void cr4_set_bits_and_update_boot(unsigned long mask) +{ + mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; + cr4_set_bits(mask); +} + +static void __init probe_page_size_mask(void) +{ + /* + * For pagealloc debugging, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. 
+ */ + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) + page_size_mask |= 1 << PG_LEVEL_2M; + else + direct_gbpages = 0; + + /* Enable PSE if available */ + if (boot_cpu_has(X86_FEATURE_PSE)) + cr4_set_bits_and_update_boot(X86_CR4_PSE); + + /* Enable PGE if available */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (boot_cpu_has(X86_FEATURE_PGE)) { + cr4_set_bits_and_update_boot(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } + + /* By the default is everything supported: */ + __default_kernel_pte_mask = __supported_pte_mask; + /* Except when with PTI where the kernel is mostly non-Global: */ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + __default_kernel_pte_mask &= ~_PAGE_GLOBAL; + + /* Enable 1 GB linear kernel mappings if available: */ + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { + printk(KERN_INFO "Using GB pages for direct mapping\n"); + page_size_mask |= 1 << PG_LEVEL_1G; + } else { + direct_gbpages = 0; + } +} + +#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \ + .family = 6, \ + .model = _model, \ + } +/* + * INVLPG may not properly flush Global entries + * on these CPUs when PCIDs are enabled. + */ +static const struct x86_cpu_id invlpg_miss_ids[] = { + INTEL_MATCH(INTEL_FAM6_ALDERLAKE ), + INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ), + INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P), + INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S), + {} +}; + +static void setup_pcid(void) +{ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + + if (x86_match_cpu(invlpg_miss_ids)) { + pr_info("Incomplete global flushes, disabling PCID"); + setup_clear_cpu_cap(X86_FEATURE_PCID); + return; + } + + if (boot_cpu_has(X86_FEATURE_PGE)) { + /* + * This can't be cr4_set_bits_and_update_boot() -- the + * trampoline code can't handle CR4.PCIDE and it wouldn't + * do any good anyway. 
Despite the name, + * cr4_set_bits_and_update_boot() doesn't actually cause + * the bits in question to remain set all the way through + * the secondary boot asm. + * + * Instead, we brute-force it and set CR4.PCIDE manually in + * start_secondary(). + */ + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't work if + * PCID is on but PGE is not. Since that combination + * doesn't exist on real hardware, there's no reason to try + * to fully support it, but it's polite to avoid corrupting + * data if we're on an improperly configured VM. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); + } +} + +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif + +static int __meminit save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<<PAGE_SHIFT; + mr[nr_range].end = end_pfn<<PAGE_SHIFT; + mr[nr_range].page_size_mask = page_size_mask; + nr_range++; + } + + return nr_range; +} + +/* + * adjust the page_size_mask for small range to go with + * big page size instead small one if nearby are ram too. 
+ */ +static void __ref adjust_range_page_size_mask(struct map_range *mr, + int nr_range) +{ + int i; + + for (i = 0; i < nr_range; i++) { + if ((page_size_mask & (1<<PG_LEVEL_2M)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { + unsigned long start = round_down(mr[i].start, PMD_SIZE); + unsigned long end = round_up(mr[i].end, PMD_SIZE); + +#ifdef CONFIG_X86_32 + if ((end >> PAGE_SHIFT) > max_low_pfn) + continue; +#endif + + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_2M; + } + if ((page_size_mask & (1<<PG_LEVEL_1G)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { + unsigned long start = round_down(mr[i].start, PUD_SIZE); + unsigned long end = round_up(mr[i].end, PUD_SIZE); + + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_1G; + } + } +} + +static const char *page_size_string(struct map_range *mr) +{ + static const char str_1g[] = "1G"; + static const char str_2m[] = "2M"; + static const char str_4m[] = "4M"; + static const char str_4k[] = "4k"; + + if (mr->page_size_mask & (1<<PG_LEVEL_1G)) + return str_1g; + /* + * 32-bit without PAE has a 4M large page size. + * PG_LEVEL_2M is misnamed, but we can at least + * print out the right size in the string. + */ + if (IS_ENABLED(CONFIG_X86_32) && + !IS_ENABLED(CONFIG_X86_PAE) && + mr->page_size_mask & (1<<PG_LEVEL_2M)) + return str_4m; + + if (mr->page_size_mask & (1<<PG_LEVEL_2M)) + return str_2m; + + return str_4k; +} + +static int __meminit split_mem_range(struct map_range *mr, int nr_range, + unsigned long start, + unsigned long end) +{ + unsigned long start_pfn, end_pfn, limit_pfn; + unsigned long pfn; + int i; + + limit_pfn = PFN_DOWN(end); + + /* head if not big page alignment ? */ + pfn = start_pfn = PFN_DOWN(start); +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. 
+ */ + if (pfn == 0) + end_pfn = PFN_DOWN(PMD_SIZE); + else + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); +#else /* CONFIG_X86_64 */ + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); +#endif + if (end_pfn > limit_pfn) + end_pfn = limit_pfn; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pfn = end_pfn; + } + + /* big page (2M) range */ + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); +#ifdef CONFIG_X86_32 + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); +#else /* CONFIG_X86_64 */ + end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + pfn = end_pfn; + } + +#ifdef CONFIG_X86_64 + /* big page (1G) range */ + start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); + pfn = end_pfn; + } + + /* tail is not big page (1G) alignment */ + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + pfn = end_pfn; + } +#endif + + /* tail is not big page (2M) alignment */ + start_pfn = pfn; + end_pfn = limit_pfn; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + if (!after_bootmem) + adjust_range_page_size_mask(mr, nr_range); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 
1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; + } + + for (i = 0; i < nr_range; i++) + pr_debug(" [mem %#010lx-%#010lx] page %s\n", + mr[i].start, mr[i].end - 1, + page_size_string(&mr[i])); + + return nr_range; +} + +struct range pfn_mapped[E820_MAX_ENTRIES]; +int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __ref init_memory_mapping(unsigned long start, + unsigned long end, pgprot_t prot) +{ + struct map_range mr[NR_RANGE_MR]; + unsigned long ret = 0; + int nr_range, i; + + pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", + start, end - 1); + + memset(mr, 0, sizeof(mr)); + nr_range = split_mem_range(mr, 0, start, end); + + for (i = 0; i < nr_range; i++) + ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, + mr[i].page_size_mask, + prot); + + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); + + return ret >> PAGE_SHIFT; +} + +/* + * We need to iterate through the E820 memory map and create direct mappings + * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. 
We cannot simply + * create direct mappings for all pfns from [0 to max_low_pfn) and + * [4GB to max_pfn) because of possible memory holes in high addresses + * that cannot be marked as UC by fixed/variable range MTRRs. + * Depending on the alignment of E820 ranges, this may possibly result + * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. + * + * init_mem_mapping() calls init_range_memory_mapping() with big range. + * That range would have hole in the middle or ends, and only ram parts + * will be mapped in init_range_memory_mapping(). + */ +static unsigned long __init init_range_memory_mapping( + unsigned long r_start, + unsigned long r_end) +{ + unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; + int i; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); + u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); + if (start >= end) + continue; + + /* + * if it is overlapping with brk pgt, we need to + * alloc pgt buf from memblock instead. + */ + can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= + min(end, (u64)pgt_buf_top<<PAGE_SHIFT); + init_memory_mapping(start, end, PAGE_KERNEL); + mapped_ram_size += end - start; + can_use_brk_pgt = true; + } + + return mapped_ram_size; +} + +static unsigned long __init get_new_step_size(unsigned long step_size) +{ + /* + * Initial mapped size is PMD_SIZE (2M). + * We can not set step_size to be PUD_SIZE (1G) yet. + * In worse case, when we cross the 1G boundary, and + * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) + * to map 1G range with PTE. Hence we use one less than the + * difference of page table level shifts. + * + * Don't need to worry about overflow in the top-down case, on 32bit, + * when step_size is 0, round_down() returns 0 for start, and that + * turns it into 0x100000000ULL. 
+ * In the bottom-up case, round_up(x, 0) returns 0 though too, which + * needs to be taken into consideration by the code below. + */ + return step_size << (PMD_SHIFT - PAGE_SHIFT - 1); +} + +/** + * memory_map_top_down - Map [map_start, map_end) top down + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in top-down. That said, the page tables + * will be allocated at the end of the memory, and we map the + * memory in top-down. + */ +static void __init memory_map_top_down(unsigned long map_start, + unsigned long map_end) +{ + unsigned long real_end, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + + /* + * Systems that have many reserved areas near top of the memory, + * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will + * require lots of 4K mappings which may exhaust pgt_buf. + * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure + * there is enough mapped memory that can be allocated from + * memblock. + */ + addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, + map_end); + memblock_phys_free(addr, PMD_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = real_end; + + /* + * We start from the top (end of memory) and go to the bottom. + * The memblock_find_in_range() gets us a block of RAM from the + * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages + * for page table. 
+ */ + while (last_start > map_start) { + unsigned long start; + + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < map_start) + start = map_start; + } else + start = map_start; + mapped_ram_size += init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + if (mapped_ram_size >= step_size) + step_size = get_new_step_size(step_size); + } + + if (real_end < map_end) + init_range_memory_mapping(real_end, map_end); +} + +/** + * memory_map_bottom_up - Map [map_start, map_end) bottom up + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in bottom-up. Since we have limited the + * bottom-up allocation above the kernel, the page tables will + * be allocated just above the kernel and we map the memory + * in [map_start, map_end) in bottom-up. + */ +static void __init memory_map_bottom_up(unsigned long map_start, + unsigned long map_end) +{ + unsigned long next, start; + unsigned long mapped_ram_size = 0; + /* step_size need to be small so pgt_buf from BRK could cover it */ + unsigned long step_size = PMD_SIZE; + + start = map_start; + min_pfn_mapped = start >> PAGE_SHIFT; + + /* + * We start from the bottom (@map_start) and go to the top (@map_end). + * The memblock_find_in_range() gets us a block of RAM from the + * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages + * for page table. 
+ */ + while (start < map_end) { + if (step_size && map_end - start > step_size) { + next = round_up(start + 1, step_size); + if (next > map_end) + next = map_end; + } else { + next = map_end; + } + + mapped_ram_size += init_range_memory_mapping(start, next); + start = next; + + if (mapped_ram_size >= step_size) + step_size = get_new_step_size(step_size); + } +} + +/* + * The real mode trampoline, which is required for bootstrapping CPUs + * occupies only a small area under the low 1MB. See reserve_real_mode() + * for details. + * + * If KASLR is disabled the first PGD entry of the direct mapping is copied + * to map the real mode trampoline. + * + * If KASLR is enabled, copy only the PUD which covers the low 1MB + * area. This limits the randomization granularity to 1GB for both 4-level + * and 5-level paging. + */ +static void __init init_trampoline(void) +{ +#ifdef CONFIG_X86_64 + /* + * The code below will alias kernel page-tables in the user-range of the + * address space, including the Global bit. So global TLB entries will + * be created when using the trampoline page-table. + */ + if (!kaslr_memory_enabled()) + trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; + else + init_trampoline_kaslr(); +#endif +} + +void __init init_mem_mapping(void) +{ + unsigned long end; + + pti_check_boottime_disable(); + probe_page_size_mask(); + setup_pcid(); + +#ifdef CONFIG_X86_64 + end = max_pfn << PAGE_SHIFT; +#else + end = max_low_pfn << PAGE_SHIFT; +#endif + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL); + + /* Init the trampoline, possibly with KASLR memory offset */ + init_trampoline(); + + /* + * If the allocation is in bottom-up direction, we setup direct mapping + * in bottom-up, otherwise we setup direct mapping in top-down. + */ + if (memblock_bottom_up()) { + unsigned long kernel_end = __pa_symbol(_end); + + /* + * we need two separate calls here. 
This is because we want to + * allocate page tables above the kernel. So we first map + * [kernel_end, end) to make memory above the kernel be mapped + * as soon as possible. And then use page tables allocated above + * the kernel to map [ISA_END_ADDRESS, kernel_end). + */ + memory_map_bottom_up(kernel_end, end); + memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); + } else { + memory_map_top_down(ISA_END_ADDRESS, end); + } + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preserve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#else + early_ioremap_page_table_range_init(); +#endif + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + + x86_init.hyper.init_mem_mapping(); + + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); +} + +/* + * Initialize an mm_struct to be used during poking and a pointer to be used + * during patching. + */ +void __init poking_init(void) +{ + spinlock_t *ptl; + pte_t *ptep; + + poking_mm = mm_alloc(); + BUG_ON(!poking_mm); + + /* Xen PV guests need the PGD to be pinned. */ + paravirt_enter_mmap(poking_mm); + + /* + * Randomize the poking address, but make sure that the following page + * will be mapped at the same PMD. We need 2 pages, so find space for 3, + * and adjust the address if the PMD ends after the first one. + */ + poking_addr = TASK_UNMAPPED_BASE; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); + + if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + poking_addr += PAGE_SIZE; + + /* + * We need to trigger the allocation of the page-tables that will be + * needed for poking now. Later, poking may be performed in an atomic + * section, which might cause allocation to fail. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + BUG_ON(!ptep); + pte_unmap_unlock(ptep, ptl); +} + +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. 
The argument is a physical page number. + * + * On x86, access has to be given to the first megabyte of RAM because that + * area traditionally contains BIOS code and data regions used by X, dosemu, + * and similar apps. Since they map the entire memory range, the whole range + * must be allowed (for mapping), but any areas that would otherwise be + * disallowed are flagged as being "zero filled" instead of rejected. + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) + != REGION_DISJOINT) { + /* + * For disallowed memory regions in the low 1MB range, + * request that the page be shown as all zeros. + */ + if (pagenr < 256) + return 2; + + return 0; + } + + /* + * This must follow RAM test, since System RAM is considered a + * restricted resource under CONFIG_STRICT_DEVMEM. + */ + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { + /* Low 1MB bypasses iomem restrictions. */ + if (pagenr < 256) + return 1; + + return 0; + } + + return 1; +} + +void free_init_pages(const char *what, unsigned long begin, unsigned long end) +{ + unsigned long begin_aligned, end_aligned; + + /* Make sure boundaries are page aligned */ + begin_aligned = PAGE_ALIGN(begin); + end_aligned = end & PAGE_MASK; + + if (WARN_ON(begin_aligned != begin || end_aligned != end)) { + begin = begin_aligned; + end = end_aligned; + } + + if (begin >= end) + return; + + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ + if (debug_pagealloc_enabled()) { + pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", + begin, end - 1); + /* + * Inform kmemleak about the hole in the memory since the + * corresponding pages will be unmapped. 
+ */ + kmemleak_free_part((void *)begin, end - begin); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); + } else { + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable and non-executable first. + */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + + free_reserved_area((void *)begin, (void *)end, + POISON_FREE_INITMEM, what); + } +} + +/* + * begin/end can be in the direct map or the "high kernel mapping" + * used for the kernel image only. free_init_pages() will do the + * right thing for either kind of address. + */ +void free_kernel_image_pages(const char *what, void *begin, void *end) +{ + unsigned long begin_ul = (unsigned long)begin; + unsigned long end_ul = (unsigned long)end; + unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; + + free_init_pages(what, begin_ul, end_ul); + + /* + * PTI maps some of the kernel into userspace. For performance, + * this includes some kernel areas that do not contain secrets. + * Those areas might be adjacent to the parts of the kernel image + * being freed, which may contain secrets. Remove the "high kernel + * image mapping" for these freed areas, ensuring they are not even + * potentially vulnerable to Meltdown regardless of the specific + * optimizations PTI is currently using. + * + * The "noalias" prevents unmapping the direct map alias which is + * needed to access the freed pages. + * + * This is only valid for 64bit kernels. 32bit has only one mapping + * which can't be treated in this way for obvious reasons. 
+ */ + if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) + set_memory_np_noalias(begin_ul, len_pages); +} + +void __ref free_initmem(void) +{ + e820__reallocate_tables(); + + mem_encrypt_free_decrypted_mem(); + + free_kernel_image_pages("unused kernel image (initmem)", + &__init_begin, &__init_end); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void __init free_initrd_mem(unsigned long start, unsigned long end) +{ + /* + * end could be not aligned, and We can not align that, + * decompressor could be confused by aligned initrd_end + * We already reserve the end partial page before in + * - i386_start_kernel() + * - x86_64_start_kernel() + * - relocate_initrd() + * So here We can do PAGE_ALIGN() safely to get partial page to be freed + */ + free_init_pages("initrd", start, PAGE_ALIGN(end)); +} +#endif + +/* + * Calculate the precise size of the DMA zone (first 16 MB of RAM), + * and pass it to the MM layer - to help it set zone watermarks more + * accurately. + * + * Done on 64-bit systems only for the time being, although 32-bit systems + * might benefit from this as well. 
+ */ +void __init memblock_find_dma_reserve(void) +{ +#ifdef CONFIG_X86_64 + u64 nr_pages = 0, nr_free_pages = 0; + unsigned long start_pfn, end_pfn; + phys_addr_t start_addr, end_addr; + int i; + u64 u; + + /* + * Iterate over all memory ranges (free and reserved ones alike), + * to calculate the total number of pages in the first 16 MB of RAM: + */ + nr_pages = 0; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min(start_pfn, MAX_DMA_PFN); + end_pfn = min(end_pfn, MAX_DMA_PFN); + + nr_pages += end_pfn - start_pfn; + } + + /* + * Iterate over free memory ranges to calculate the number of free + * pages in the DMA zone, while not counting potential partial + * pages at the beginning or the end of the range: + */ + nr_free_pages = 0; + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { + start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN); + end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN); + + if (start_pfn < end_pfn) + nr_free_pages += end_pfn - start_pfn; + } + + set_dma_reserve(nr_pages - nr_free_pages); +#endif +} + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif + + free_area_init(max_zone_pfns); +} + +__visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = { + .loaded_mm = &init_mm, + .next_asid = 1, + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ +}; + +#ifdef CONFIG_ADDRESS_MASKING +DEFINE_PER_CPU(u64, tlbstate_untag_mask); +EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask); +#endif + +void update_cache_mode_entry(unsigned entry, enum 
page_cache_mode cache) +{ + /* entry 0 MUST be WB (hardwired to speed up translations) */ + BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); + + __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); + __pte2cachemode_tbl[entry] = cache; +} + +#ifdef CONFIG_SWAP +unsigned long arch_max_swapfile_size(void) +{ + unsigned long pages; + + pages = generic_max_swapfile_size(); + + if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ + unsigned long long l1tf_limit = l1tf_pfn_limit(); + /* + * We encode swap offsets also with 3 bits below those for pfn + * which makes the usable limit higher. + */ +#if CONFIG_PGTABLE_LEVELS > 2 + l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; +#endif + pages = min_t(unsigned long long, l1tf_limit, pages); + } + return pages; +} +#endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c new file mode 100644 index 0000000000..b63403d717 --- /dev/null +++ b/arch/x86/mm/init_32.c @@ -0,0 +1,805 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/pci.h> +#include <linux/pfn.h> +#include <linux/poison.h> +#include <linux/memblock.h> +#include <linux/proc_fs.h> +#include <linux/memory_hotplug.h> +#include <linux/initrd.h> +#include <linux/cpumask.h> +#include <linux/gfp.h> + +#include <asm/asm.h> +#include <asm/bios_ebda.h> +#include <asm/processor.h> +#include <linux/uaccess.h> +#include <asm/dma.h> +#include <asm/fixmap.h> 
+#include <asm/e820/api.h> +#include <asm/apic.h> +#include <asm/bugs.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/olpc_ofw.h> +#include <asm/pgalloc.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/set_memory.h> +#include <asm/page_types.h> +#include <asm/cpu_entry_area.h> +#include <asm/init.h> +#include <asm/pgtable_areas.h> +#include <asm/numa.h> + +#include "mm_internal.h" + +unsigned long highstart_pfn, highend_pfn; + +bool __read_mostly __vmalloc_start_set = false; + +/* + * Creates a middle page table and puts a pointer to it in the + * given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t * __init one_md_table_init(pgd_t *pgd) +{ + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { + pmd_table = (pmd_t *)alloc_low_page(); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); + BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; + } +#endif + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); + pmd_table = pmd_offset(pud, 0); + + return pmd_table; +} + +/* + * Create a page table and place a pointer to it in a middle page + * directory entry: + */ +static pte_t * __init one_page_table_init(pmd_t *pmd) +{ + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { + pte_t *page_table = (pte_t *)alloc_low_page(); + + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); + } + + return pte_offset_kernel(pmd, 0); +} + +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return 
one_page_table_init(pmd) + pte_idx; +} + +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ + unsigned long count = 0; +#ifdef CONFIG_HIGHMEM + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + if (pmd_idx_kmap_begin == pmd_idx_kmap_end) + return 0; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd_idx++) { + if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && + (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) + count++; + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +#endif + return count; +} + +static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, + unsigned long vaddr, pte_t *lastpte, + void **adr) +{ +#ifdef CONFIG_HIGHMEM + /* + * Something (early fixmap) may already have put a pte + * page here, which causes the page table allocation + * to become nonlinear. Attempt to fix it, and if it + * is still nonlinear then we have to bug. 
+ */ + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + + if (pmd_idx_kmap_begin != pmd_idx_kmap_end + && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) { + pte_t *newpte; + int i; + + BUG_ON(after_bootmem); + newpte = *adr; + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(newpte + i, pte[i]); + *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); + + set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); + BUG_ON(newpte != pte_offset_kernel(pmd, 0)); + __flush_tlb_all(); + + pte = newpte; + } + BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) + && vaddr > fix_to_virt(FIX_KMAP_END) + && lastpte && lastpte + PTRS_PER_PTE != pte); +#endif + return pte; +} + +/* + * This function initializes a certain range of kernel virtual memory + * with new bootmem page tables, everywhere page tables are missing in + * the given range. + * + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without + * checking the pgd every time. 
+ */ +static void __init +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) +{ + int pgd_idx, pmd_idx; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte = NULL; + unsigned long count = page_table_range_init_count(start, end); + void *adr = NULL; + + if (count) + adr = alloc_low_pages(count); + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { + pmd = one_md_table_init(pgd); + pmd = pmd + pmd_index(vaddr); + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd++, pmd_idx++) { + pte = page_table_kmap_check(one_page_table_init(pmd), + pmd, vaddr, pte, &adr); + + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +} + +static inline int is_x86_32_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) + return 1; + return 0; +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: + */ +unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask, + pgprot_t prot) +{ + int use_pse = page_size_mask == (1<<PG_LEVEL_2M); + unsigned long last_map_addr = end; + unsigned long start_pfn, end_pfn; + pgd_t *pgd_base = swapper_pg_dir; + int pgd_idx, pmd_idx, pte_ofs; + unsigned long pfn; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned pages_2m, pages_4k; + int mapping_iter; + + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. 
+ * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + */ + mapping_iter = 1; + + if (!boot_cpu_has(X86_FEATURE_PSE)) + use_pse = 0; + +repeat: + pages_2m = pages_4k = 0; + pfn = start_pfn; + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pgd = pgd_base + pgd_idx; + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { + pmd = one_md_table_init(pgd); + + if (pfn >= end_pfn) + continue; +#ifdef CONFIG_X86_PAE + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pmd += pmd_idx; +#else + pmd_idx = 0; +#endif + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn; + pmd++, pmd_idx++) { + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; + + /* + * Map with big pages if possible, otherwise + * create normal page tables: + */ + if (use_pse) { + unsigned int addr2; + pgprot_t prot = PAGE_KERNEL_LARGE; + /* + * first pass will use the same initial + * identity mapping attribute + _PAGE_PSE. + */ + pgprot_t init_prot = + __pgprot(PTE_IDENT_ATTR | + _PAGE_PSE); + + pfn &= PMD_MASK >> PAGE_SHIFT; + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_x86_32_kernel_text(addr) || + is_x86_32_kernel_text(addr2)) + prot = PAGE_KERNEL_LARGE_EXEC; + + pages_2m++; + if (mapping_iter == 1) + set_pmd(pmd, pfn_pmd(pfn, init_prot)); + else + set_pmd(pmd, pfn_pmd(pfn, prot)); + + pfn += PTRS_PER_PTE; + continue; + } + pte = one_page_table_init(pmd); + + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pte += pte_ofs; + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { + pgprot_t prot = PAGE_KERNEL; + /* + * first pass will use the same initial + * identity mapping attribute. 
+ */ + pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); + + if (is_x86_32_kernel_text(addr)) + prot = PAGE_KERNEL_EXEC; + + pages_4k++; + if (mapping_iter == 1) { + set_pte(pte, pfn_pte(pfn, init_prot)); + last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE; + } else + set_pte(pte, pfn_pte(pfn, prot)); + } + } + } + if (mapping_iter == 1) { + /* + * update direct mapping page count only in the first + * iteration. + */ + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); + + /* + * local global flush tlb, which will flush the previous + * mappings present in both small and large page TLB's. + */ + __flush_tlb_all(); + + /* + * Second iteration will set the actual desired PTE attributes. + */ + mapping_iter = 2; + goto repeat; + } + return last_map_addr; +} + +#ifdef CONFIG_HIGHMEM +static void __init permanent_kmaps_init(pgd_t *pgd_base) +{ + unsigned long vaddr = PKMAP_BASE; + + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pkmap_page_table = virt_to_kpte(vaddr); +} + +void __init add_highpages_with_active_regions(int nid, + unsigned long start_pfn, unsigned long end_pfn) +{ + phys_addr_t start, end; + u64 i; + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { + unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), + start_pfn, end_pfn); + unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), + start_pfn, end_pfn); + for ( ; pfn < e_pfn; pfn++) + if (pfn_valid(pfn)) + free_highmem_page(pfn_to_page(pfn)); + } +} +#else +static inline void permanent_kmaps_init(pgd_t *pgd_base) +{ +} +#endif /* CONFIG_HIGHMEM */ + +void __init sync_initial_page_table(void) +{ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. 
+ */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +} + +void __init native_pagetable_init(void) +{ + unsigned long pfn, va; + pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* + * Remove any mappings which extend past the end of physical + * memory from the boot time page table. + * In virtual address space, we should have at least two pages + * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END + * definition. And max_low_pfn is set to VMALLOC_END physical + * address. If initial memory mapping is doing right job, we + * should have pte used near max_low_pfn or one pmd is not present. + */ + for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); + pgd = base + pgd_index(va); + if (!pgd_present(*pgd)) + break; + + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); + pmd = pmd_offset(pud, va); + if (!pmd_present(*pmd)) + break; + + /* should not be large page here */ + if (pmd_large(*pmd)) { + pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n", + pfn, pmd, __pa(pmd)); + BUG_ON(1); + } + + pte = pte_offset_kernel(pmd, va); + if (!pte_present(*pte)) + break; + + printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n", + pfn, pmd, __pa(pmd), pte, __pa(pte)); + pte_clear(NULL, va, pte); + } + paging_init(); +} + +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * This will be a pagetable constructed in arch/x86/kernel/head_32.S. + * The root of the pagetable will be swapper_pg_dir. 
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+void __init early_ioremap_page_table_range_init(void)
+{
+	pgd_t *pgd_base = swapper_pg_dir;
+	unsigned long vaddr, end;
+
+	/*
+	 * Fixed mappings, only the page table structure has to be
+	 * created - mappings will be set by set_fixmap():
+	 *
+	 * The range handed to page_table_range_init() is widened to whole
+	 * PMDs: the address of the last fixmap index is rounded down to a
+	 * PMD boundary and FIXADDR_TOP is rounded up to one.
+	 */
+	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+	page_table_range_init(vaddr, end, pgd_base);
+	early_ioremap_reset();
+}
+/*
+ * Finish the kernel page tables rooted at swapper_pg_dir. The only work
+ * done here is permanent_kmaps_init(), which is an empty inline stub
+ * unless CONFIG_HIGHMEM is enabled.
+ */
+static void __init pagetable_init(void)
+{
+	pgd_t *pgd_base = swapper_pg_dir;
+
+	permanent_kmaps_init(pgd_base);
+}
+
+#define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL)
+/* Bits supported by the hardware (initially all but _PAGE_NX and _PAGE_GLOBAL): */
+pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK;
+/* Bits allowed in normal kernel mappings (same initial value): */
+pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
+EXPORT_SYMBOL(__default_kernel_pte_mask);
+
+/*
+ * User-defined highmem size, in pages. The initial value -1 is the
+ * "not set" sentinel tested by lowmem_pfn_init() and highmem_pfn_init().
+ */
+static unsigned int highmem_pages = -1;
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; + + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); + +#define MSG_HIGHMEM_TOO_BIG \ + "highmem size (%luMB) is bigger than pages available (%luMB)!\n" + +#define MSG_LOWMEM_TOO_SMALL \ + "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" +/* + * All of RAM fits into lowmem - but if user wants highmem + * artificially via the highmem=x boot parameter then create + * it: + */ +static void __init lowmem_pfn_init(void) +{ + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, + pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { + printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif +} + +#define MSG_HIGHMEM_TOO_SMALL \ + "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" + +#define MSG_HIGHMEM_TRIMMED \ + "Warning: only 4GB will be used. 
Use a HIGHMEM64G enabled kernel!\n" +/* + * We have more RAM than fits into lowmem - we try to put it into + * highmem, also taking the highmem=x boot parameter into account: + */ +static void __init highmem_pfn_init(void) +{ + max_low_pfn = MAXMEM_PFN; + + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_HIGHMEM64G + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); + } +#endif /* !CONFIG_HIGHMEM64G */ +#endif /* !CONFIG_HIGHMEM */ +} + +/* + * Determine low and high memory ranges: + */ +void __init find_low_pfn_range(void) +{ + /* it could update max_pfn */ + + if (max_pfn <= MAXMEM_PFN) + lowmem_pfn_init(); + else + highmem_pfn_init(); +} + +#ifndef CONFIG_NUMA +void __init initmem_init(void) +{ +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + + memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); + +#ifdef CONFIG_FLATMEM + max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? 
highend_pfn : max_low_pfn; +#endif + __vmalloc_start_set = true; + + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); +} +#endif /* !CONFIG_NUMA */ + +void __init setup_bootmem_allocator(void) +{ + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); + printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); +} + +/* + * paging_init() sets up the page tables - note that the first 8MB are + * already mapped by head.S. + * + * This routines also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. + */ +void __init paging_init(void) +{ + pagetable_init(); + + __flush_tlb_all(); + + /* + * NOTE: at this point the bootmem allocator is fully available. + */ + olpc_dt_build_devicetree(); + sparse_init(); + zone_sizes_init(); +} + +/* + * Test if the WP bit works in supervisor mode. It isn't supported on 386's + * and also on some strange 486's. All 586+'s are OK. This used to involve + * black magic jumps to work around some nasty CPU bugs, but fortunately the + * switch to using exceptions got rid of all that. + */ +static void __init test_wp_bit(void) +{ + char z = 0; + + printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode..."); + + __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO); + + if (copy_to_kernel_nofault((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) { + clear_fixmap(FIX_WP_TEST); + printk(KERN_CONT "Ok.\n"); + return; + } + + printk(KERN_CONT "No.\n"); + panic("Linux doesn't support CPUs with broken WP."); +} + +void __init mem_init(void) +{ + pci_iommu_alloc(); + +#ifdef CONFIG_FLATMEM + BUG_ON(!mem_map); +#endif + /* + * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to + * be done before memblock_free_all(). 
Memblock use free low memory for + * temporary data (see find_range_array()) and for this purpose can use + * pages that was already passed to the buddy allocator, hence marked as + * not accessible in the page tables when compiled with + * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not + * important here. + */ + set_highmem_pages_init(); + + /* this will put all low memory onto the freelists */ + memblock_free_all(); + + after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); + + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. + */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START >= VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); + + test_wp_bit(); +} + +int kernel_set_to_readonly __read_mostly; + +static void mark_nxdata_nx(void) +{ + /* + * When this called, init has already been executed and released, + * so everything past _etext should be NX. + */ + unsigned long start = PFN_ALIGN(_etext); + /* + * This comes from is_x86_32_kernel_text upper limit. 
Also HPAGE where used: + */ + unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; + + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); + set_memory_nx(start, size >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = (unsigned long)__end_rodata - start; + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + pr_info("Write protecting kernel text and read-only data: %luk\n", + size >> 10); + + kernel_set_to_readonly = 1; + +#ifdef CONFIG_CPA_DEBUG + pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size); + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); + + pr_info("Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +#endif + mark_nxdata_nx(); + if (__supported_pte_mask & _PAGE_NX) + debug_checkwx(); +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c new file mode 100644 index 0000000000..a190aae8ce --- /dev/null +++ b/arch/x86/mm/init_64.c @@ -0,0 +1,1636 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/arch/x86_64/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/initrd.h> +#include <linux/pagemap.h> +#include <linux/memblock.h> +#include <linux/proc_fs.h> +#include <linux/pci.h> +#include <linux/pfn.h> +#include <linux/poison.h> +#include <linux/dma-mapping.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/memremap.h> +#include <linux/nmi.h> +#include 
<linux/gfp.h> +#include <linux/kcore.h> +#include <linux/bootmem_info.h> + +#include <asm/processor.h> +#include <asm/bios_ebda.h> +#include <linux/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820/api.h> +#include <asm/apic.h> +#include <asm/tlb.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> +#include <asm/smp.h> +#include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/numa.h> +#include <asm/set_memory.h> +#include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> +#include <asm/ftrace.h> + +#include "mm_internal.h" + +#include "ident_map.c" + +#define DEFINE_POPULATE(fname, type1, type2, init) \ +static inline void fname##_init(struct mm_struct *mm, \ + type1##_t *arg1, type2##_t *arg2, bool init) \ +{ \ + if (init) \ + fname##_safe(mm, arg1, arg2); \ + else \ + fname(mm, arg1, arg2); \ +} + +DEFINE_POPULATE(p4d_populate, p4d, pud, init) +DEFINE_POPULATE(pgd_populate, pgd, p4d, init) +DEFINE_POPULATE(pud_populate, pud, pmd, init) +DEFINE_POPULATE(pmd_populate_kernel, pmd, pte, init) + +#define DEFINE_ENTRY(type1, type2, init) \ +static inline void set_##type1##_init(type1##_t *arg1, \ + type2##_t arg2, bool init) \ +{ \ + if (init) \ + set_##type1##_safe(arg1, arg2); \ + else \ + set_##type1(arg1, arg2); \ +} + +DEFINE_ENTRY(p4d, p4d, init) +DEFINE_ENTRY(pud, pud, init) +DEFINE_ENTRY(pmd, pmd, init) +DEFINE_ENTRY(pte, pte, init) + +static inline pgprot_t prot_sethuge(pgprot_t prot) +{ + WARN_ON_ONCE(pgprot_val(prot) & _PAGE_PAT); + + return __pgprot(pgprot_val(prot) | _PAGE_PSE); +} + +/* + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the + * physical space so we can cache the place of the first one and move + * around without checking the pgd every time. 
+ */
+
+/* Bits supported by the hardware: */
+pteval_t __supported_pte_mask __read_mostly = ~0;
+/* Bits allowed in normal kernel mappings: */
+pteval_t __default_kernel_pte_mask __read_mostly = ~0;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
+EXPORT_SYMBOL(__default_kernel_pte_mask);
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ *
+ * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off	PROT_READ implies PROT_EXEC
+ *
+ * Any other value leaves force_personality32 untouched. The handler
+ * always returns 1.
+ */
+static int __init nonx32_setup(char *str)
+{
+	if (!strcmp(str, "on"))
+		force_personality32 &= ~READ_IMPLIES_EXEC;
+	else if (!strcmp(str, "off"))
+		force_personality32 |= READ_IMPLIES_EXEC;
+	return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+static void sync_global_pgds_l5(unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+
+	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+		const pgd_t *pgd_ref = pgd_offset_k(addr);
+		struct page *page;
+
+		/*
+		 * Check for overflow: ALIGN(addr + 1, PGDIR_SIZE) wraps
+		 * around once the top of the address space is passed.
+		 */
+		if (addr < start)
+			break;
+
+		if (pgd_none(*pgd_ref))
+			continue;
+
+		spin_lock(&pgd_lock);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			spinlock_t *pgt_lock;
+
+			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+			/* the pgt_lock is only needed for Xen */
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
+			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+			if (pgd_none(*pgd))
+				set_pgd(pgd, *pgd_ref);
+
+			spin_unlock(pgt_lock);
+		}
+		spin_unlock(&pgd_lock);
+	}
+}
+
+static void sync_global_pgds_l4(unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+
+	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+		pgd_t *pgd_ref = pgd_offset_k(addr);
+		const p4d_t *p4d_ref;
+		struct page *page;
+
+		/*
+		 * With folded p4d, pgd_none() is always false, we need to
+		 * handle synchronization on p4d level.
+		 */
+		MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
+		p4d_ref = p4d_offset(pgd_ref, addr);
+
+		if (p4d_none(*p4d_ref))
+			continue;
+
+		spin_lock(&pgd_lock);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			p4d_t *p4d;
+			spinlock_t *pgt_lock;
+
+			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+			p4d = p4d_offset(pgd, addr);
+			/* the pgt_lock is only needed for Xen */
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
+			if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
+				BUG_ON(p4d_pgtable(*p4d)
+				       != p4d_pgtable(*p4d_ref));
+
+			if (p4d_none(*p4d))
+				set_p4d(p4d, *p4d_ref);
+
+			spin_unlock(pgt_lock);
+		}
+		spin_unlock(&pgd_lock);
+	}
+}
+
+/*
+ * When memory was added, make sure the mm of every process has
+ * suitable PGD entries in its own PGD level page.
+ *
+ * The range [start, end] is walked in PGDIR_SIZE steps. For each step the
+ * reference entry is looked up with pgd_offset_k(); if it is populated,
+ * every PGD page on pgd_list is visited under pgd_lock and its matching
+ * entry is copied from the reference when it is still empty. An entry that
+ * is already populated must refer to the same lower-level table as the
+ * reference, otherwise BUG_ON() fires.
+ *
+ * When pgtable_l5_enabled() is true the entries compared and copied are
+ * PGD entries (sync_global_pgds_l5()), otherwise they are P4D entries
+ * (sync_global_pgds_l4()).
+ */
+static void sync_global_pgds(unsigned long start, unsigned long end)
+{
+	if (pgtable_l5_enabled())
+		sync_global_pgds_l5(start, end);
+	else
+		sync_global_pgds_l4(start, end);
+}
+
+/*
+ * Allocate one page for use as a page table: from
+ * get_zeroed_page(GFP_ATOMIC) once after_bootmem is set, from
+ * memblock_alloc() before that. Panics if the allocation fails or the
+ * result is not page aligned.
+ *
+ * NOTE: This function is marked __ref because it calls an __init function
+ * (memblock_alloc). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
+{
+	void *ptr;
+
+	if (after_bootmem)
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+	else
+		ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+
+	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+		panic("set_pte_phys: cannot allocate page data %s\n",
+			after_bootmem ? "after bootmem" : "");
+	}
+
+	pr_debug("spp_getpage %p\n", ptr);
+
+	return ptr;
+}
+/*
+ * fill_p4d(), fill_pud(), fill_pmd() and fill_pte() return the next-level
+ * entry for @vaddr below the entry passed in. When that entry is empty, a
+ * page from spp_getpage() is first installed as the new table in init_mm.
+ * If the installed table is not what the entry then resolves to, this is
+ * only reported with printk(KERN_ERR); execution continues.
+ */
+static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
+{
+	if (pgd_none(*pgd)) {
+		p4d_t *p4d = (p4d_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, p4d);
+		if (p4d != p4d_offset(pgd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+			       p4d, p4d_offset(pgd, 0));
+	}
+	return p4d_offset(pgd, vaddr);
+}
+
+static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
+{
+	if (p4d_none(*p4d)) {
+		pud_t *pud = (pud_t *)spp_getpage();
+		p4d_populate(&init_mm, p4d, pud);
+		if (pud != pud_offset(p4d, 0))
+			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+			       pud, pud_offset(p4d, 0));
+	}
+	return pud_offset(p4d, vaddr);
+}
+
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
+	if (pud_none(*pud)) {
+		pmd_t *pmd = (pmd_t *) spp_getpage();
+		pud_populate(&init_mm, pud, pmd);
+		if (pmd != pmd_offset(pud, 0))
+			printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
+			       pmd, pmd_offset(pud, 0));
+	}
+	return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
+	if (pmd_none(*pmd)) {
+		pte_t *pte = (pte_t *) spp_getpage();
+		pmd_populate_kernel(&init_mm, pmd, pte);
+		if (pte != pte_offset_kernel(pmd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #03!\n");
+	}
+	return pte_offset_kernel(pmd, vaddr);
+}
+
+static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
+{
+	pmd_t *pmd = fill_pmd(pud, vaddr);	/* creates the PMD table if missing */
+	pte_t *pte = fill_pte(pmd, vaddr);	/* creates the PTE table if missing */
+
+	set_pte(pte, new_pte);
+
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	flush_tlb_one_kernel(vaddr);
+}
+
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
+{
+	p4d_t *p4d = p4d_page + p4d_index(vaddr);	/* @p4d_page is the table base */
+	pud_t *pud = fill_pud(p4d, vaddr);
+
+	__set_pte_vaddr(pud, vaddr, new_pte);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+	pud_t *pud = pud_page + pud_index(vaddr);	/* @pud_page is the table base */
+
+	__set_pte_vaddr(pud, vaddr, new_pte);
+}
+
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+	pgd_t *pgd;
+	p4d_t *p4d_page;
+
+	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
+
+	pgd = pgd_offset_k(vaddr);
+	if (pgd_none(*pgd)) {	/* the top level is never allocated here */
+		printk(KERN_ERR
+			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
+		return;
+	}
+
+	p4d_page = p4d_offset(pgd, 0);
+	set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
+}
+
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+
+	pgd = pgd_offset_k(vaddr);
+	p4d = fill_p4d(pgd, vaddr);
+	pud = fill_pud(p4d, vaddr);
+	return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return fill_pte(pmd, vaddr);
+}
+
+/*
+ * Create large page table mappings for a range of physical addresses.
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+					enum page_cache_mode cache)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pgprot_t prot;
+	/*
+	 * Protection for every PMD written below: PAGE_KERNEL_LARGE plus the
+	 * bits for @cache, passed through protval_4k_2_large() (presumably to
+	 * move the 4k PAT bit to its large-page position - confirm).
+	 *
+	 * The range is mapped at __va(phys) in the tables reached through
+	 * pgd_offset_k(); missing intermediate tables come from spp_getpage().
+	 */
+	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
+		protval_4k_2_large(cachemode2protval(cache));
+	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));	/* PMD_SIZE multiples only */
+	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {	/* one PMD entry per step */
+		pgd = pgd_offset_k((unsigned long)__va(phys));
+		if (pgd_none(*pgd)) {
+			p4d = (p4d_t *) spp_getpage();
+			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		p4d = p4d_offset(pgd, (unsigned long)__va(phys));
+		if (p4d_none(*p4d)) {
+			pud = (pud_t *) spp_getpage();
+			set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		pud = pud_offset(p4d, (unsigned long)__va(phys));
+		if (pud_none(*pud)) {
+			pmd = (pmd_t *) spp_getpage();
+			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		/*
+		 * NOTE(review): indexed with phys while the levels above use
+		 * __va(phys); presumably the same PMD index - confirm.
+		 */
+		pmd = pmd_offset(pud, phys);
+		BUG_ON(!pmd_none(*pmd));	/* never overwrite an existing entry */
+		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+	}
+}
+/* Map [phys, phys + size) with large pages using cache mode WB. */
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
+}
+/* Map [phys, phys + size) with large pages using cache mode UC. */
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
+}
+
+/*
+ * The head.S code sets up the kernel high mapping:
+ *
+ *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
+ *
+ * phys_base holds the negative offset to the kernel, which is added
+ * to the compile time generated pmds. This results in invalid pmds up
+ * to the point where we hit the physaddr 0 mapping.
+ *
+ * We limit the mappings to the region from _text to _brk_end.  _brk_end
+ * is rounded up to the 2MB boundary.
This catches the invalid pmds as + * well, as they are located before _text: + */ +void __init cleanup_highmap(void) +{ + unsigned long vaddr = __START_KERNEL_map; + unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; + unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt; + + /* + * Native path, max_pfn_mapped is not set yet. + * Xen has valid max_pfn_mapped set in + * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). + */ + if (max_pfn_mapped) + vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); + } +} + +/* + * Create PTE level page table mapping for physical addresses. + * It returns the last physical address mapped. + */ +static unsigned long __meminit +phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, + pgprot_t prot, bool init) +{ + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + pte_t *pte; + int i; + + pte = pte_page + pte_index(paddr); + i = pte_index(paddr); + + for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) { + paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE; + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & PAGE_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PAGE_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_pte_init(pte, __pte(0), init); + continue; + } + + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. 
+ */ + if (!pte_none(*pte)) { + if (!after_bootmem) + pages++; + continue; + } + + if (0) + pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, + pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); + pages++; + set_pte_init(pte, pfn_pte(paddr >> PAGE_SHIFT, prot), init); + paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; + } + + update_page_count(PG_LEVEL_4K, pages); + + return paddr_last; +} + +/* + * Create PMD level page table mapping for physical addresses. The virtual + * and physical address have to be aligned at this level. + * It returns the last physical address mapped. + */ +static unsigned long __meminit +phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask, pgprot_t prot, bool init) +{ + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + + int i = pmd_index(paddr); + + for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) { + pmd_t *pmd = pmd_page + pmd_index(paddr); + pte_t *pte; + pgprot_t new_prot = prot; + + paddr_next = (paddr & PMD_MASK) + PMD_SIZE; + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & PMD_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PMD_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_pmd_init(pmd, __pmd(0), init); + continue; + } + + if (!pmd_none(*pmd)) { + if (!pmd_large(*pmd)) { + spin_lock(&init_mm.page_table_lock); + pte = (pte_t *)pmd_page_vaddr(*pmd); + paddr_last = phys_pte_init(pte, paddr, + paddr_end, prot, + init); + spin_unlock(&init_mm.page_table_lock); + continue; + } + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. 
+ */ + if (page_size_mask & (1 << PG_LEVEL_2M)) { + if (!after_bootmem) + pages++; + paddr_last = paddr_next; + continue; + } + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); + } + + if (page_size_mask & (1<<PG_LEVEL_2M)) { + pages++; + spin_lock(&init_mm.page_table_lock); + set_pmd_init(pmd, + pfn_pmd(paddr >> PAGE_SHIFT, prot_sethuge(prot)), + init); + spin_unlock(&init_mm.page_table_lock); + paddr_last = paddr_next; + continue; + } + + pte = alloc_low_page(); + paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot, init); + + spin_lock(&init_mm.page_table_lock); + pmd_populate_kernel_init(&init_mm, pmd, pte, init); + spin_unlock(&init_mm.page_table_lock); + } + update_page_count(PG_LEVEL_2M, pages); + return paddr_last; +} + +/* + * Create PUD level page table mapping for physical addresses. The virtual + * and physical address do not have to be aligned at this level. KASLR can + * randomize virtual addresses up to this level. + * It returns the last physical address mapped. 
+ */ +static unsigned long __meminit +phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask, pgprot_t _prot, bool init) +{ + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + unsigned long vaddr = (unsigned long)__va(paddr); + int i = pud_index(vaddr); + + for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) { + pud_t *pud; + pmd_t *pmd; + pgprot_t prot = _prot; + + vaddr = (unsigned long)__va(paddr); + pud = pud_page + pud_index(vaddr); + paddr_next = (paddr & PUD_MASK) + PUD_SIZE; + + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & PUD_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PUD_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_pud_init(pud, __pud(0), init); + continue; + } + + if (!pud_none(*pud)) { + if (!pud_large(*pud)) { + pmd = pmd_offset(pud, 0); + paddr_last = phys_pmd_init(pmd, paddr, + paddr_end, + page_size_mask, + prot, init); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. 
+ */ + if (page_size_mask & (1 << PG_LEVEL_1G)) { + if (!after_bootmem) + pages++; + paddr_last = paddr_next; + continue; + } + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); + } + + if (page_size_mask & (1<<PG_LEVEL_1G)) { + pages++; + spin_lock(&init_mm.page_table_lock); + set_pud_init(pud, + pfn_pud(paddr >> PAGE_SHIFT, prot_sethuge(prot)), + init); + spin_unlock(&init_mm.page_table_lock); + paddr_last = paddr_next; + continue; + } + + pmd = alloc_low_page(); + paddr_last = phys_pmd_init(pmd, paddr, paddr_end, + page_size_mask, prot, init); + + spin_lock(&init_mm.page_table_lock); + pud_populate_init(&init_mm, pud, pmd, init); + spin_unlock(&init_mm.page_table_lock); + } + + update_page_count(PG_LEVEL_1G, pages); + + return paddr_last; +} + +static unsigned long __meminit +phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask, pgprot_t prot, bool init) +{ + unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last; + + paddr_last = paddr_end; + vaddr = (unsigned long)__va(paddr); + vaddr_end = (unsigned long)__va(paddr_end); + + if (!pgtable_l5_enabled()) + return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, + page_size_mask, prot, init); + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + p4d_t *p4d = p4d_page + p4d_index(vaddr); + pud_t *pud; + + vaddr_next = (vaddr & P4D_MASK) + P4D_SIZE; + paddr = __pa(vaddr); + + if (paddr >= paddr_end) { + paddr_next = __pa(vaddr_next); + if (!after_bootmem && + !e820__mapped_any(paddr & P4D_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & P4D_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_p4d_init(p4d, __p4d(0), init); + continue; + } + + if (!p4d_none(*p4d)) { + pud = pud_offset(p4d, 0); + paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end), + page_size_mask, prot, init); + continue; + } + + pud = alloc_low_page(); + paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end), + page_size_mask, prot, init); + + 
spin_lock(&init_mm.page_table_lock); + p4d_populate_init(&init_mm, p4d, pud, init); + spin_unlock(&init_mm.page_table_lock); + } + + return paddr_last; +} + +static unsigned long __meminit +__kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask, + pgprot_t prot, bool init) +{ + bool pgd_changed = false; + unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; + + paddr_last = paddr_end; + vaddr = (unsigned long)__va(paddr_start); + vaddr_end = (unsigned long)__va(paddr_end); + vaddr_start = vaddr; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d; + + vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; + + if (pgd_val(*pgd)) { + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + paddr_last = phys_p4d_init(p4d, __pa(vaddr), + __pa(vaddr_end), + page_size_mask, + prot, init); + continue; + } + + p4d = alloc_low_page(); + paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), + page_size_mask, prot, init); + + spin_lock(&init_mm.page_table_lock); + if (pgtable_l5_enabled()) + pgd_populate_init(&init_mm, pgd, p4d, init); + else + p4d_populate_init(&init_mm, p4d_offset(pgd, vaddr), + (pud_t *) p4d, init); + + spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; + } + + if (pgd_changed) + sync_global_pgds(vaddr_start, vaddr_end - 1); + + return paddr_last; +} + + +/* + * Create page table mapping for the physical memory for specific physical + * addresses. Note that it can only be used to populate non-present entries. + * The virtual and physical addresses have to be aligned on PMD level + * down. It returns the last physical address mapped. 
+ */ +unsigned long __meminit +kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask, pgprot_t prot) +{ + return __kernel_physical_mapping_init(paddr_start, paddr_end, + page_size_mask, prot, true); +} + +/* + * This function is similar to kernel_physical_mapping_init() above with the + * exception that it uses set_{pud,pmd}() instead of the set_{pud,pte}_safe() + * when updating the mapping. The caller is responsible to flush the TLBs after + * the function returns. + */ +unsigned long __meminit +kernel_physical_mapping_change(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask) +{ + return __kernel_physical_mapping_init(paddr_start, paddr_end, + page_size_mask, PAGE_KERNEL, + false); +} + +#ifndef CONFIG_NUMA +void __init initmem_init(void) +{ + memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); +} +#endif + +void __init paging_init(void) +{ + sparse_init(); + + /* + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. + */ + node_clear_state(0, N_MEMORY); + node_clear_state(0, N_NORMAL_MEMORY); + + zone_sizes_init(); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#define PAGE_UNUSED 0xFD + +/* + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges + * from unused_pmd_start to next PMD_SIZE boundary. 
+ */ +static unsigned long unused_pmd_start __meminitdata; + +static void __meminit vmemmap_flush_unused_pmd(void) +{ + if (!unused_pmd_start) + return; + /* + * Clears (unused_pmd_start, PMD_END] + */ + memset((void *)unused_pmd_start, PAGE_UNUSED, + ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start); + unused_pmd_start = 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +/* Returns true if the PMD is completely unused and thus it can be freed */ +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + /* + * Flush the unused range cache to ensure that memchr_inv() will work + * for the whole range. + */ + vmemmap_flush_unused_pmd(); + memset((void *)addr, PAGE_UNUSED, end - addr); + + return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE); +} +#endif + +static void __meminit __vmemmap_use_sub_pmd(unsigned long start) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed when removing some other adjacent memmap (just in + * case the first memmap never gets initialized e.g., because the memory + * block never gets onlined). + */ + memset((void *)start, 0, sizeof(struct page)); +} + +static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). + */ + if (unused_pmd_start == start) { + if (likely(IS_ALIGNED(end, PMD_SIZE))) + unused_pmd_start = 0; + else + unused_pmd_start = end; + return; + } + + /* + * If the range does not contiguously follows previous one, make sure + * to mark the unused range of the previous one so it can be removed. 
+ */ + vmemmap_flush_unused_pmd(); + __vmemmap_use_sub_pmd(start); +} + + +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + const unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_pmd(); + + /* + * Could be our memmap page is filled with PAGE_UNUSED already from a + * previous remove. Make sure to reset it. + */ + __vmemmap_use_sub_pmd(start); + + /* + * Mark with PAGE_UNUSED the unused parts of the new memmap range + */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset((void *)page, PAGE_UNUSED, start - page); + + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD where the + * unused range begins. + */ + if (!IS_ALIGNED(end, PMD_SIZE)) + unused_pmd_start = end; +} +#endif + +/* + * Memory hotplug specific functions + */ +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. 
+ */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, + struct mhp_params *params) +{ + int ret; + + ret = __add_pages(nid, start_pfn, nr_pages, params); + WARN_ON_ONCE(ret); + + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); + + return ret; +} + +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_params *params) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + init_memory_mapping(start, start + size, params->pgprot); + + return add_pages(nid, start_pfn, nr_pages, params); +} + +static void __meminit free_pagetable(struct page *page, int order) +{ + unsigned long magic; + unsigned int nr_pages = 1 << order; + + /* bootmem page has reserved flag */ + if (PageReserved(page)) { + __ClearPageReserved(page); + + magic = page->index; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + while (nr_pages--) + put_page_bootmem(page++); + } else + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit free_hugepage_table(struct page *page, + struct vmem_altmap *altmap) +{ + if (altmap) + vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE); + else + free_pagetable(page, get_order(PMD_SIZE)); +} + +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + /* free a pte talbe */ + free_pagetable(pmd_page(*pmd), 0); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} + +static void 
__meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + /* free a pmd talbe */ + free_pagetable(pud_page(*pud), 0); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + /* free a pud talbe */ + free_pagetable(p4d_page(*p4d), 0); + spin_lock(&init_mm.page_table_lock); + p4d_clear(p4d); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte; + phys_addr_t phys_addr; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + /* + * We mapped [0,1G) memory as identity mapping when + * initializing, in arch/x86/kernel/head_64.S. These + * pagetables cannot be removed. + */ + phys_addr = pte_val(*pte) + (addr & PAGE_MASK); + if (phys_addr < (phys_addr_t)0x40000000) + return; + + if (!direct) + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + + /* For non-direct mapping, pages means nothing. */ + pages++; + } + + /* Call free_pte_table() in remove_pmd_table(). 
*/ + flush_tlb_all(); + if (direct) + update_page_count(PG_LEVEL_4K, -pages); +} + +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, + bool direct, struct vmem_altmap *altmap) +{ + unsigned long next, pages = 0; + pte_t *pte_base; + pmd_t *pmd; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_hugepage_table(pmd_page(*pmd), + altmap); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + pages++; + } +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (vmemmap_pmd_is_unused(addr, next)) { + free_hugepage_table(pmd_page(*pmd), + altmap); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + } +#endif + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); + } + + /* Call free_pmd_table() in remove_pud_table(). 
*/ + if (direct) + update_page_count(PG_LEVEL_2M, -pages); +} + +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, + struct vmem_altmap *altmap, bool direct) +{ + unsigned long next, pages = 0; + pmd_t *pmd_base; + pud_t *pud; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_large(*pud) && + IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + pages++; + continue; + } + + pmd_base = pmd_offset(pud, 0); + remove_pmd_table(pmd_base, addr, next, direct, altmap); + free_pmd_table(pmd_base, pud); + } + + if (direct) + update_page_count(PG_LEVEL_1G, -pages); +} + +static void __meminit +remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, + struct vmem_altmap *altmap, bool direct) +{ + unsigned long next, pages = 0; + pud_t *pud_base; + p4d_t *p4d; + + p4d = p4d_start + p4d_index(addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + BUILD_BUG_ON(p4d_large(*p4d)); + + pud_base = pud_offset(p4d, 0); + remove_pud_table(pud_base, addr, next, altmap, direct); + /* + * For 4-level page tables we do not want to free PUDs, but in the + * 5-level case we should free them. This code will have to change + * to adapt for boot-time switching between 4 and 5 level page tables. + */ + if (pgtable_l5_enabled()) + free_pud_table(pud_base, p4d); + } + + if (direct) + update_page_count(PG_LEVEL_512G, -pages); +} + +/* start and end are both virtual address. 
*/ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long next; + unsigned long addr; + pgd_t *pgd; + p4d_t *p4d; + + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + p4d = p4d_offset(pgd, 0); + remove_p4d_table(p4d, addr, next, altmap, direct); + } + + flush_tlb_all(); +} + +void __ref vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + + remove_pagetable(start, end, false, altmap); +} + +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + remove_pagetable(start, end, true, NULL); +} + +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + kernel_physical_mapping_remove(start, start + size); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static struct kcore_list kcore_vsyscall; + +static void __init register_page_bootmem_info(void) +{ +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} + +/* + * Pre-allocates page-table pages for the vmalloc area in the kernel page-table. + * Only the level which needs to be synchronized between all page-tables is + * allocated because the synchronization can be expensive. 
+ */ +static void __init preallocate_vmalloc_pages(void) +{ + unsigned long addr; + const char *lvl; + + for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + + lvl = "p4d"; + p4d = p4d_alloc(&init_mm, pgd, addr); + if (!p4d) + goto failed; + + if (pgtable_l5_enabled()) + continue; + + /* + * The goal here is to allocate all possibly required + * hardware page tables pointed to by the top hardware + * level. + * + * On 4-level systems, the P4D layer is folded away and + * the above code does no preallocation. Below, go down + * to the pud _software_ level to ensure the second + * hardware level is allocated on 4-level systems too. + */ + lvl = "pud"; + pud = pud_alloc(&init_mm, p4d, addr); + if (!pud) + goto failed; + } + + return; + +failed: + + /* + * The pages have to be there now or they will be missing in + * process page-tables later. + */ + panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl); +} + +void __init mem_init(void) +{ + pci_iommu_alloc(); + + /* clear_bss() already clear the empty_zero_page */ + + /* this will put all memory onto the freelists */ + memblock_free_all(); + after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); + + /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and memblock_free_all() initializes all the reserved + * deferred pages for us. + */ + register_page_bootmem_info(); + + /* Register memory areas for /proc/kcore */ + if (get_gate_vma(&init_mm)) + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER); + + preallocate_vmalloc_pages(); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask) +{ + /* + * More CPUs always led to greater speedups on tested systems, up to + * all the nodes' CPUs. 
Use all since the system is otherwise idle + * now. + */ + return max_t(int, cpumask_weight(node_cpumask), 1); +} +#endif + +int kernel_set_to_readonly; + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long text_end = PFN_ALIGN(_etext); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); + unsigned long all_end; + + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + kernel_set_to_readonly = 1; + + /* + * The rodata/data/bss/brk section (but not the kernel text!) + * should also be not-executable. + * + * We align all_end to PMD_SIZE because the existing mapping + * is a full PMD. If we would align _brk_end to PAGE_SIZE we + * split the PMD and the reminder between _brk_end and the end + * of the PMD will remain mapped executable. + * + * Any PMD which was setup after the one which covers _brk_end + * has been zapped already via cleanup_highmem(). + */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); + set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); + + set_ftrace_ops_ro(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); + set_memory_rw(start, (end-start) >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: again\n"); + set_memory_ro(start, (end-start) >> PAGE_SHIFT); +#endif + + free_kernel_image_pages("unused kernel image (text/rodata gap)", + (void *)text_end, (void *)rodata_start); + free_kernel_image_pages("unused kernel image (rodata/data gap)", + (void *)rodata_end, (void *)_sdata); + + debug_checkwx(); +} + +/* + * Block size is the minimum amount of memory which can be hotplugged or + * hotremoved. It must be power of two and must be equal or larger than + * MIN_MEMORY_BLOCK_SIZE. 
+ */ +#define MAX_BLOCK_SIZE (2UL << 30) + +/* Amount of ram needed to start using large blocks */ +#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30) + +/* Adjustable memory block size */ +static unsigned long set_memory_block_size; +int __init set_memory_block_size_order(unsigned int order) +{ + unsigned long size = 1UL << order; + + if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE) + return -EINVAL; + + set_memory_block_size = size; + return 0; +} + +static unsigned long probe_memory_block_size(void) +{ + unsigned long boot_mem_end = max_pfn << PAGE_SHIFT; + unsigned long bz; + + /* If memory block size has been set, then use it */ + bz = set_memory_block_size; + if (bz) + goto done; + + /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */ + if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) { + bz = MIN_MEMORY_BLOCK_SIZE; + goto done; + } + + /* + * Use max block size to minimize overhead on bare metal, where + * alignment for memory hotplug isn't a concern. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + bz = MAX_BLOCK_SIZE; + goto done; + } + + /* Find the largest allowed block size that aligns to memory end */ + for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { + if (IS_ALIGNED(boot_mem_end, bz)) + break; + } +done: + pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); + + return bz; +} + +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void) +{ + if (!memory_block_size_probed) + memory_block_size_probed = probe_memory_block_size(); + + return memory_block_size_probed; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 
+ */ +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) +{ + pte_t entry; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; + + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) + vmemmap_use_new_sub_pmd(addr, next); +} + +int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_large(*pmd); + + if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + vmemmap_use_sub_pmd(addr, next); + } + + return large; +} + +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + int err; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + + if (end - start < PAGES_PER_SECTION * sizeof(struct page)) + err = vmemmap_populate_basepages(start, end, node, NULL); + else if (boot_cpu_has(X86_FEATURE_PSE)) + err = vmemmap_populate_hugepages(start, end, node, altmap); + else if (altmap) { + pr_err_once("%s: no cpu support for altmap allocations\n", + __func__); + err = -ENOMEM; + } else + err = vmemmap_populate_basepages(start, end, node, NULL); + if (!err) + sync_global_pgds(start, end - 1); + return err; +} + +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long nr_pages) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned 
long)(start_page + nr_pages); + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + unsigned int nr_pmd_pages; + struct page *page; + + for (; addr < end; addr = next) { + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO); + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + + if (!boot_cpu_has(X86_FEATURE_PSE)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + get_page_bootmem(section_nr, pmd_page(*pmd), + MIX_SECTION_INFO); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + continue; + get_page_bootmem(section_nr, pte_page(*pte), + SECTION_INFO); + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + nr_pmd_pages = 1 << get_order(PMD_SIZE); + page = pmd_page(*pmd); + while (nr_pmd_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } + } +} +#endif + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } +} +#endif diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c new file mode 100644 index 0000000000..9aaa756ddf --- /dev/null +++ b/arch/x86/mm/iomap_32.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright © 2008 Ingo Molnar + */ + +#include <asm/iomap.h> +#include <asm/memtype.h> +#include <linux/export.h> +#include 
<linux/highmem.h> + +static int is_io_mapping_possible(resource_size_t base, unsigned long size) +{ +#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) + /* There is no way to map greater than 1 << 32 address without PAE */ + if (base + size > 0x100000000ULL) + return 0; +#endif + return 1; +} + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = memtype_reserve_io(base, base + size, &pcm); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm)); + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(*prot) &= __default_kernel_pte_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void iomap_free(resource_size_t base, unsigned long size) +{ + memtype_free_io(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); + +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + /* + * For non-PAT systems, translate non-WB request to UC- just in + * case the caller set the PWT bit to prot directly without using + * pgprot_writecombine(). UC- translates to uncached if the MTRR + * is UC or WC. UC- gets the real intention, of the user, which is + * "WC if the MTRR is WC, UC if you can't do that." 
+ */ + if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB) + prot = __pgprot(__PAGE_KERNEL | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); + + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(prot) &= __default_kernel_pte_mask; + + return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); +} +EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c new file mode 100644 index 0000000000..aa7d279321 --- /dev/null +++ b/arch/x86/mm/ioremap.c @@ -0,0 +1,936 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/memblock.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/ioport.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/mmiotrace.h> +#include <linux/cc_platform.h> +#include <linux/efi.h> +#include <linux/pgtable.h> +#include <linux/kmsan.h> + +#include <asm/set_memory.h> +#include <asm/e820/api.h> +#include <asm/efi.h> +#include <asm/fixmap.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> +#include <asm/memtype.h> +#include <asm/setup.h> + +#include "physaddr.h" + +/* + * Descriptor controlling ioremap() behavior. + */ +struct ioremap_desc { + unsigned int flags; +}; + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. 
+ */ +int ioremap_change_attr(unsigned long vaddr, unsigned long size, + enum page_cache_mode pcm) +{ + unsigned long nrpages = size >> PAGE_SHIFT; + int err; + + switch (pcm) { + case _PAGE_CACHE_MODE_UC: + default: + err = _set_memory_uc(vaddr, nrpages); + break; + case _PAGE_CACHE_MODE_WC: + err = _set_memory_wc(vaddr, nrpages); + break; + case _PAGE_CACHE_MODE_WT: + err = _set_memory_wt(vaddr, nrpages); + break; + case _PAGE_CACHE_MODE_WB: + err = _set_memory_wb(vaddr, nrpages); + break; + } + + return err; +} + +/* Does the range (or a subset of) contain normal RAM? */ +static unsigned int __ioremap_check_ram(struct resource *res) +{ + unsigned long start_pfn, stop_pfn; + unsigned long i; + + if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) + return 0; + + start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; + stop_pfn = (res->end + 1) >> PAGE_SHIFT; + if (stop_pfn > start_pfn) { + for (i = 0; i < (stop_pfn - start_pfn); ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return IORES_MAP_SYSTEM_RAM; + } + + return 0; +} + +/* + * In a SEV guest, NONE and RESERVED should not be mapped encrypted because + * there the whole memory is already encrypted. + */ +static unsigned int __ioremap_check_encrypted(struct resource *res) +{ + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return 0; + + switch (res->desc) { + case IORES_DESC_NONE: + case IORES_DESC_RESERVED: + break; + default: + return IORES_MAP_ENCRYPTED; + } + + return 0; +} + +/* + * The EFI runtime services data area is not covered by walk_mem_res(), but must + * be mapped encrypted when SEV is active. 
+ */ +static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *desc) +{ + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + if (x86_platform.hyper.is_private_mmio(addr)) { + desc->flags |= IORES_MAP_ENCRYPTED; + return; + } + + if (!IS_ENABLED(CONFIG_EFI)) + return; + + if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA || + (efi_mem_type(addr) == EFI_BOOT_SERVICES_DATA && + efi_mem_attributes(addr) & EFI_MEMORY_RUNTIME)) + desc->flags |= IORES_MAP_ENCRYPTED; +} + +static int __ioremap_collect_map_flags(struct resource *res, void *arg) +{ + struct ioremap_desc *desc = arg; + + if (!(desc->flags & IORES_MAP_SYSTEM_RAM)) + desc->flags |= __ioremap_check_ram(res); + + if (!(desc->flags & IORES_MAP_ENCRYPTED)) + desc->flags |= __ioremap_check_encrypted(res); + + return ((desc->flags & (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)) == + (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)); +} + +/* + * To avoid multiple resource walks, this function walks resources marked as + * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a + * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). + * + * After that, deal with misc other ranges in __ioremap_check_other() which do + * not fall into the above category. + */ +static void __ioremap_check_mem(resource_size_t addr, unsigned long size, + struct ioremap_desc *desc) +{ + u64 start, end; + + start = (u64)addr; + end = start + size - 1; + memset(desc, 0, sizeof(struct ioremap_desc)); + + walk_mem_res(start, end, desc, __ioremap_collect_map_flags); + + __ioremap_check_other(addr, desc); +} + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. It transparently creates kernel huge I/O mapping when + * the physical address is aligned by a huge page size (1GB or 2MB) and + * the requested size is at least the huge page size. + * + * NOTE: MTRRs can override PAT memory types with a 4KB granularity. 
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB + * when a mapping range is covered by non-WB type of MTRRs. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +static void __iomem * +__ioremap_caller(resource_size_t phys_addr, unsigned long size, + enum page_cache_mode pcm, void *caller, bool encrypted) +{ + unsigned long offset, vaddr; + resource_size_t last_addr; + const resource_size_t unaligned_phys_addr = phys_addr; + const unsigned long unaligned_size = size; + struct ioremap_desc io_desc; + struct vm_struct *area; + enum page_cache_mode new_pcm; + pgprot_t prot; + int retval; + void __iomem *ret_addr; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + if (!phys_addr_valid(phys_addr)) { + printk(KERN_WARNING "ioremap: invalid physical address %llx\n", + (unsigned long long)phys_addr); + WARN_ON_ONCE(1); + return NULL; + } + + __ioremap_check_mem(phys_addr, size, &io_desc); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + if (io_desc.flags & IORES_MAP_SYSTEM_RAM) { + WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", + &phys_addr, &last_addr); + return NULL; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Mask out any bits not part of the actual physical + * address, like memory encryption bits. 
+ */ + phys_addr &= PHYSICAL_PAGE_MASK; + + retval = memtype_reserve(phys_addr, (u64)phys_addr + size, + pcm, &new_pcm); + if (retval) { + printk(KERN_ERR "ioremap memtype_reserve failed %d\n", retval); + return NULL; + } + + if (pcm != new_pcm) { + if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) { + printk(KERN_ERR + "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n", + (unsigned long long)phys_addr, + (unsigned long long)(phys_addr + size), + pcm, new_pcm); + goto err_free_memtype; + } + pcm = new_pcm; + } + + /* + * If the page being mapped is in memory and SEV is active then + * make sure the memory encryption attribute is enabled in the + * resulting mapping. + * In TDX guests, memory is marked private by default. If encryption + * is not requested (using encrypted), explicitly set decrypt + * attribute in all IOREMAPPED memory. + */ + prot = PAGE_KERNEL_IO; + if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + + switch (pcm) { + case _PAGE_CACHE_MODE_UC: + default: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC)); + break; + case _PAGE_CACHE_MODE_UC_MINUS: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); + break; + case _PAGE_CACHE_MODE_WC: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); + break; + case _PAGE_CACHE_MODE_WT: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); + break; + case _PAGE_CACHE_MODE_WB: + break; + } + + /* + * Ok, go for it.. 
+ */ + area = get_vm_area_caller(size, VM_IOREMAP, caller); + if (!area) + goto err_free_memtype; + area->phys_addr = phys_addr; + vaddr = (unsigned long) area->addr; + + if (memtype_kernel_map_sync(phys_addr, size, pcm)) + goto err_free_area; + + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) + goto err_free_area; + + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size)) + pr_warn("caller %pS mapping multiple BARs\n", caller); + + return ret_addr; +err_free_area: + free_vm_area(area); +err_free_memtype: + memtype_free(phys_addr, phys_addr + size); + return NULL; +} + +/** + * ioremap - map bus memory into CPU space + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) +{ + /* + * Ideally, this should be: + * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; + * + * Till we fix all X drivers to use ioremap_wc(), we will use + * UC MINUS. 
Drivers that are certain they need or can already + * be converted over to strong UC can use ioremap_uc(). + */ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0), false); +} +EXPORT_SYMBOL(ioremap); + +/** + * ioremap_uc - map bus memory into CPU space as strongly uncachable + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap_uc performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked with a strong + * preference as completely uncachable on the CPU when possible. For non-PAT + * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT + * systems this will set the PAT entry for the pages as strong UC. This call + * will honor existing caching rules from things like the PCI bus. Note that + * there are other caches and buffers on many busses. In particular driver + * authors should read up on PCI writes. + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0), false); +} +EXPORT_SYMBOL_GPL(ioremap_uc); + +/** + * ioremap_wc - map memory into CPU space write combined + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write combining. + * Write combining allows faster writes to some hardware devices. + * + * Must be freed with iounmap. 
+ */ +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, + __builtin_return_address(0), false); +} +EXPORT_SYMBOL(ioremap_wc); + +/** + * ioremap_wt - map memory into CPU space write through + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write through. + * Write through stores data into memory while keeping the cache up-to-date. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, + __builtin_return_address(0), false); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, + __builtin_return_address(0), true); +} +EXPORT_SYMBOL(ioremap_encrypted); + +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, + __builtin_return_address(0), false); +} +EXPORT_SYMBOL(ioremap_cache); + +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return __ioremap_caller(phys_addr, size, + pgprot2cachemode(__pgprot(prot_val)), + __builtin_return_address(0), false); +} +EXPORT_SYMBOL(ioremap_prot); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. 
However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. + */ + if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); + return; + } + + mmiotrace_iounmap(addr); + + addr = (volatile void __iomem *) + (PAGE_MASK & (unsigned long __force)addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + p = find_vm_area((void __force *)addr); + + if (!p) { + printk(KERN_ERR "iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + kmsan_iounmap_page_range((unsigned long)addr, + (unsigned long)addr + get_vm_area_size(p)); + memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p)); + + /* Finally remove it */ + o = remove_vm_area((void __force *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(phys_addr_t phys) +{ + unsigned long start = phys & PAGE_MASK; + unsigned long offset = phys & ~PAGE_MASK; + void *vaddr; + + /* memremap() maps if RAM, otherwise falls back to ioremap() */ + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); + + /* Only add the offset on success and return NULL if memremap() failed */ + if (vaddr) + vaddr += offset; + + return vaddr; +} + +void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) +{ + memunmap((void *)((unsigned long)addr & PAGE_MASK)); +} + +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. 
If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. And since the encryption key can + * change across reboots, persistent memory should also be mapped + * decrypted. + * + * If SEV is active, that implies that BIOS/UEFI also ran encrypted so + * only persistent memory should be mapped decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + int is_pmem; + + /* + * Check if the address is part of a persistent memory region. + * This check covers areas added by E820, EFI and ACPI. + */ + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); + if (is_pmem != REGION_DISJOINT) + return true; + + /* + * Check if the non-volatile attribute is set for an EFI + * reserved area. + */ + if (efi_enabled(EFI_BOOT)) { + switch (efi_mem_type(phys_addr)) { + case EFI_RESERVED_TYPE: + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) + return true; + break; + default: + break; + } + } + + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + /* For SEV, these areas are encrypted */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + break; + fallthrough; + + case E820_TYPE_PRAM: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. 
+ */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_indirect *indirect; + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + if (!data) { + pr_warn("failed to memremap setup_data entry\n"); + return false; + } + + paddr_next = data->next; + len = data->len; + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + memunmap(data); + return true; + } + + if (data->type == SETUP_INDIRECT) { + memunmap(data); + data = memremap(paddr, sizeof(*data) + len, + MEMREMAP_WB | MEMREMAP_DEC); + if (!data) { + pr_warn("failed to memremap indirect setup_data\n"); + return false; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } + } + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* 
+ * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_indirect *indirect; + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len, size; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + if (!data) { + pr_warn("failed to early memremap setup_data entry\n"); + return false; + } + + size = sizeof(*data); + + paddr_next = data->next; + len = data->len; + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + early_memunmap(data, sizeof(*data)); + return true; + } + + if (data->type == SETUP_INDIRECT) { + size += len; + early_memunmap(data, sizeof(*data)); + data = early_memremap_decrypted(paddr, size); + if (!data) { + pr_warn("failed to early memremap indirect setup_data\n"); + return false; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } + } + + early_memunmap(data, size); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. 
+ */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + return false; + } + + return !memremap_should_map_decrypted(phys_addr, size); +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + bool encrypted_prot; + + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) + return prot; + + encrypted_prot = true; + + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + encrypted_prot = false; + } + + if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size)) + encrypted_prot = false; + + return encrypted_prot ? 
pgprot_encrypted(prot) + : pgprot_decrypted(prot); +} + +bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) +{ + return arch_memremap_can_ram_remap(phys_addr, size, 0); +} + +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + if (!x86_has_pat_wp()) + return NULL; + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + if (!x86_has_pat_wp()) + return NULL; + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); +} +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; + +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) +{ + /* Don't assume we're using swapper_pg_dir at this point */ + pgd_t *base = __va(read_cr3_pa()); + pgd_t *pgd = &base[pgd_index(addr)]; + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + pmd_t *pmd = pmd_offset(pud, addr); + + return pmd; +} + +static inline pte_t * __init early_ioremap_pte(unsigned long addr) +{ + return &bm_pte[pte_index(addr)]; +} + +bool __init is_early_ioremap_ptep(pte_t *ptep) +{ + return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; +} + +void __init early_ioremap_init(void) +{ + pmd_t *pmd; + +#ifdef 
CONFIG_X86_64 + BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#else + WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#endif + + early_ioremap_setup(); + + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + memset(bm_pte, 0, sizeof(bm_pte)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); + + /* + * The boot-ioremap range spans multiple pmds, for which + * we are not prepared: + */ +#define __FIXADDR_TOP (-PAGE_SIZE) + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); +#undef __FIXADDR_TOP + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + printk(KERN_WARNING "pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } +} + +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + pte = early_ioremap_pte(addr); + + /* Sanitize 'prot' against any unsupported bits: */ + pgprot_val(flags) &= __supported_pte_mask; + + if (pgprot_val(flags)) + set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); + else + pte_clear(&init_mm, addr, pte); + flush_tlb_one_kernel(addr); +} diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c new file mode 100644 index 0000000000..0302491d79 --- /dev/null +++ b/arch/x86/mm/kasan_init_64.c @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 +#define DISABLE_BRANCH_PROFILING +#define pr_fmt(fmt) "kasan: " fmt + +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + +#include 
<linux/memblock.h> +#include <linux/kasan.h> +#include <linux/kdebug.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/vmalloc.h> + +#include <asm/e820/types.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/cpu_entry_area.h> + +extern struct range pfn_mapped[E820_MAX_ENTRIES]; + +static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); + +static __init void *early_alloc(size_t size, int nid, bool should_panic) +{ + void *ptr = memblock_alloc_try_nid(size, size, + __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); + + if (!ptr && should_panic) + panic("%pS: Failed to allocate page, nid=%d from=%lx\n", + (void *)_RET_IP_, nid, __pa(MAX_DMA_ADDRESS)); + + return ptr; +} + +static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, int nid) +{ + pte_t *pte; + + if (pmd_none(*pmd)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_PSE) && + ((end - addr) == PMD_SIZE) && + IS_ALIGNED(addr, PMD_SIZE)) { + p = early_alloc(PMD_SIZE, nid, false); + if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) + return; + memblock_free(p, PMD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid, true); + pmd_populate_kernel(&init_mm, pmd, p); + } + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t entry; + void *p; + + if (!pte_none(*pte)) + continue; + + p = early_alloc(PAGE_SIZE, nid, true); + entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, + unsigned long end, int nid) +{ + pmd_t *pmd; + unsigned long next; + + if (pud_none(*pud)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_GBPAGES) && + ((end - addr) == PUD_SIZE) && + IS_ALIGNED(addr, PUD_SIZE)) { + p = early_alloc(PUD_SIZE, nid, false); + if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) + return; + memblock_free(p, 
PUD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid, true); + pud_populate(&init_mm, pud, p); + } + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_large(*pmd)) + kasan_populate_pmd(pmd, addr, next, nid); + } while (pmd++, addr = next, addr != end); +} + +static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, int nid) +{ + pud_t *pud; + unsigned long next; + + if (p4d_none(*p4d)) { + void *p = early_alloc(PAGE_SIZE, nid, true); + + p4d_populate(&init_mm, p4d, p); + } + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_large(*pud)) + kasan_populate_pud(pud, addr, next, nid); + } while (pud++, addr = next, addr != end); +} + +static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, + unsigned long end, int nid) +{ + void *p; + p4d_t *p4d; + unsigned long next; + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, nid, true); + pgd_populate(&init_mm, pgd, p); + } + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + kasan_populate_p4d(p4d, addr, next, nid); + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, + int nid) +{ + pgd_t *pgd; + unsigned long next; + + addr = addr & PAGE_MASK; + end = round_up(end, PAGE_SIZE); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + kasan_populate_pgd(pgd, addr, next, nid); + } while (pgd++, addr = next, addr != end); +} + +static void __init map_range(struct range *range) +{ + unsigned long start; + unsigned long end; + + start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); + end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); + + kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); +} + +static void __init clear_pgds(unsigned long start, + unsigned long end) +{ + pgd_t *pgd; + /* See comment in kasan_init() */ + unsigned long pgd_end = end & 
PGDIR_MASK; + + for (; start < pgd_end; start += PGDIR_SIZE) { + pgd = pgd_offset_k(start); + /* + * With folded p4d, pgd_clear() is nop, use p4d_clear() + * instead. + */ + if (pgtable_l5_enabled()) + pgd_clear(pgd); + else + p4d_clear(p4d_offset(pgd, start)); + } + + pgd = pgd_offset_k(start); + for (; start < end; start += P4D_SIZE) + p4d_clear(p4d_offset(pgd, start)); +} + +static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) +{ + unsigned long p4d; + + if (!pgtable_l5_enabled()) + return (p4d_t *)pgd; + + p4d = pgd_val(*pgd) & PTE_PFN_MASK; + p4d += __START_KERNEL_map - phys_base; + return (p4d_t *)p4d + p4d_index(addr); +} + +static void __init kasan_early_p4d_populate(pgd_t *pgd, + unsigned long addr, + unsigned long end) +{ + pgd_t pgd_entry; + p4d_t *p4d, p4d_entry; + unsigned long next; + + if (pgd_none(*pgd)) { + pgd_entry = __pgd(_KERNPG_TABLE | + __pa_nodebug(kasan_early_shadow_p4d)); + set_pgd(pgd, pgd_entry); + } + + p4d = early_p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (!p4d_none(*p4d)) + continue; + + p4d_entry = __p4d(_KERNPG_TABLE | + __pa_nodebug(kasan_early_shadow_pud)); + set_p4d(p4d, p4d_entry); + } while (p4d++, addr = next, addr != end && p4d_none(*p4d)); +} + +static void __init kasan_map_early_shadow(pgd_t *pgd) +{ + /* See comment in kasan_init() */ + unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK; + unsigned long end = KASAN_SHADOW_END; + unsigned long next; + + pgd += pgd_index(addr); + do { + next = pgd_addr_end(addr, end); + kasan_early_p4d_populate(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +static void __init kasan_shallow_populate_p4ds(pgd_t *pgd, + unsigned long addr, + unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + void *p; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (p4d_none(*p4d)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true); + p4d_populate(&init_mm, p4d, p); + } + } while (p4d++, addr = next, 
addr != end); +} + +static void __init kasan_shallow_populate_pgds(void *start, void *end) +{ + unsigned long addr, next; + pgd_t *pgd; + void *p; + + addr = (unsigned long)start; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, (unsigned long)end); + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true); + pgd_populate(&init_mm, pgd, p); + } + + /* + * we need to populate p4ds to be synced when running in + * four level mode - see sync_global_pgds_l4() + */ + kasan_shallow_populate_p4ds(pgd, addr, next); + } while (pgd++, addr = next, addr != (unsigned long)end); +} + +void __init kasan_early_init(void) +{ + int i; + pteval_t pte_val = __pa_nodebug(kasan_early_shadow_page) | + __PAGE_KERNEL | _PAGE_ENC; + pmdval_t pmd_val = __pa_nodebug(kasan_early_shadow_pte) | _KERNPG_TABLE; + pudval_t pud_val = __pa_nodebug(kasan_early_shadow_pmd) | _KERNPG_TABLE; + p4dval_t p4d_val = __pa_nodebug(kasan_early_shadow_pud) | _KERNPG_TABLE; + + /* Mask out unsupported __PAGE_KERNEL bits: */ + pte_val &= __default_kernel_pte_mask; + pmd_val &= __default_kernel_pte_mask; + pud_val &= __default_kernel_pte_mask; + p4d_val &= __default_kernel_pte_mask; + + for (i = 0; i < PTRS_PER_PTE; i++) + kasan_early_shadow_pte[i] = __pte(pte_val); + + for (i = 0; i < PTRS_PER_PMD; i++) + kasan_early_shadow_pmd[i] = __pmd(pmd_val); + + for (i = 0; i < PTRS_PER_PUD; i++) + kasan_early_shadow_pud[i] = __pud(pud_val); + + for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++) + kasan_early_shadow_p4d[i] = __p4d(p4d_val); + + kasan_map_early_shadow(early_top_pgt); + kasan_map_early_shadow(init_top_pgt); +} + +static unsigned long kasan_mem_to_shadow_align_down(unsigned long va) +{ + unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va); + + return round_down(shadow, PAGE_SIZE); +} + +static unsigned long kasan_mem_to_shadow_align_up(unsigned long va) +{ + unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va); + + return 
round_up(shadow, PAGE_SIZE); +} + +void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid) +{ + unsigned long shadow_start, shadow_end; + + shadow_start = kasan_mem_to_shadow_align_down((unsigned long)va); + shadow_end = kasan_mem_to_shadow_align_up((unsigned long)va + size); + kasan_populate_shadow(shadow_start, shadow_end, nid); +} + +void __init kasan_init(void) +{ + unsigned long shadow_cea_begin, shadow_cea_per_cpu_begin, shadow_cea_end; + int i; + + memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); + + /* + * We use the same shadow offset for 4- and 5-level paging to + * facilitate boot-time switching between paging modes. + * As result in 5-level paging mode KASAN_SHADOW_START and + * KASAN_SHADOW_END are not aligned to PGD boundary. + * + * KASAN_SHADOW_START doesn't share PGD with anything else. + * We claim whole PGD entry to make things easier. + * + * KASAN_SHADOW_END lands in the last PGD entry and it collides with + * bunch of things like kernel code, modules, EFI mapping, etc. + * We need to take extra steps to not overwrite them. 
+ */ + if (pgtable_l5_enabled()) { + void *ptr; + + ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); + memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table)); + set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)], + __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE)); + } + + load_cr3(early_top_pgt); + __flush_tlb_all(); + + clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END); + + kasan_populate_early_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + for (i = 0; i < E820_MAX_ENTRIES; i++) { + if (pfn_mapped[i].end == 0) + break; + + map_range(&pfn_mapped[i]); + } + + shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE); + shadow_cea_per_cpu_begin = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_PER_CPU); + shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE + + CPU_ENTRY_AREA_MAP_SIZE); + + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void *)VMALLOC_START)); + + /* + * If we're in full vmalloc mode, don't back vmalloc space with early + * shadow pages. Instead, prepopulate pgds/p4ds so they are synced to + * the global table and we can populate the lower levels on demand. + */ + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_shallow_populate_pgds( + kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + else + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)VMALLOC_END + 1), + (void *)shadow_cea_begin); + + /* + * Populate the shadow for the shared portion of the CPU entry area. + * Shadows for the per-CPU areas are mapped on-demand, as each CPU's + * area is randomly placed somewhere in the 512GiB range and mapping + * the entire 512GiB range is prohibitively expensive. 
+ */ + kasan_populate_shadow(shadow_cea_begin, + shadow_cea_per_cpu_begin, 0); + + kasan_populate_early_shadow((void *)shadow_cea_end, + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + + kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); + + kasan_populate_early_shadow(kasan_mem_to_shadow((void *)MODULES_END), + (void *)KASAN_SHADOW_END); + + load_cr3(init_top_pgt); + __flush_tlb_all(); + + /* + * kasan_early_shadow_page has been used as early shadow memory, thus + * it may contain some garbage. Now we can clear and write protect it, + * since after the TLB flush no one should write to it. + */ + memset(kasan_early_shadow_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte; + pgprot_t prot; + + prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC); + pgprot_val(prot) &= __default_kernel_pte_mask; + + pte = __pte(__pa(kasan_early_shadow_page) | pgprot_val(prot)); + set_pte(&kasan_early_shadow_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); + + init_task.kasan_depth = 0; + pr_info("KernelAddressSanitizer initialized\n"); +} diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c new file mode 100644 index 0000000000..37db264866 --- /dev/null +++ b/arch/x86/mm/kaslr.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file implements KASLR memory randomization for x86_64. It randomizes + * the virtual address space of kernel memory regions (physical memory + * mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates + * exploits relying on predictable kernel addresses. + * + * Entropy is generated using the KASLR early boot functions now shared in + * the lib directory (originally written by Kees Cook). Randomization is + * done on PGD & P4D/PUD page table levels to increase possible addresses. 
+ * The physical memory mapping code was adapted to support P4D/PUD level + * virtual addresses. This implementation in the best configuration provides + * 30,000 possible virtual addresses on average for each memory region. + * An additional low memory page is used to ensure each CPU can start with + * a PGD aligned virtual address (for realmode). + * + * The order of each memory region is not changed. The feature looks at + * the available space for the regions based on different configuration + * options and randomizes the base and space between each. The size of the + * physical memory mapping is the available physical memory. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/random.h> +#include <linux/memblock.h> +#include <linux/pgtable.h> + +#include <asm/setup.h> +#include <asm/kaslr.h> + +#include "mm_internal.h" + +#define TB_SHIFT 40 + +/* + * The end address could depend on more configuration options to make the + * highest amount of space for randomization available, but that's too hard + * to keep straight and caused issues already. + */ +static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; + +/* + * Memory regions randomized by KASLR (except modules that use a separate logic + * earlier during boot). The list is ordered based on virtual addresses. This + * order is kept after randomization. 
+ */ +static __initdata struct kaslr_memory_region { + unsigned long *base; + unsigned long size_tb; +} kaslr_regions[] = { + { &page_offset_base, 0 }, + { &vmalloc_base, 0 }, + { &vmemmap_base, 0 }, +}; + +/* Get size in bytes used by the memory region */ +static inline unsigned long get_padding(struct kaslr_memory_region *region) +{ + return (region->size_tb << TB_SHIFT); +} + +/* Initialize base and padding for each memory region randomized with KASLR */ +void __init kernel_randomize_memory(void) +{ + size_t i; + unsigned long vaddr_start, vaddr; + unsigned long rand, memory_tb; + struct rnd_state rand_state; + unsigned long remain_entropy; + unsigned long vmemmap_size; + + vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; + vaddr = vaddr_start; + + /* + * These BUILD_BUG_ON checks ensure the memory layout is consistent + * with the vaddr_start/vaddr_end variables. These checks are very + * limited.... + */ + BUILD_BUG_ON(vaddr_start >= vaddr_end); + BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); + BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + + if (!kaslr_memory_enabled()) + return; + + kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT); + kaslr_regions[1].size_tb = VMALLOC_SIZE_TB; + + /* + * Update Physical memory mapping to available and + * add padding if needed (especially for memory hotplug support). + */ + BUG_ON(kaslr_regions[0].base != &page_offset_base); + memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; + + /* Adapt physical memory region size based on available memory */ + if (memory_tb < kaslr_regions[0].size_tb) + kaslr_regions[0].size_tb = memory_tb; + + /* + * Calculate the vmemmap region size in TBs, aligned to a TB + * boundary. 
+ */ + vmemmap_size = (kaslr_regions[0].size_tb << (TB_SHIFT - PAGE_SHIFT)) * + sizeof(struct page); + kaslr_regions[2].size_tb = DIV_ROUND_UP(vmemmap_size, 1UL << TB_SHIFT); + + /* Calculate entropy available between regions */ + remain_entropy = vaddr_end - vaddr_start; + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) + remain_entropy -= get_padding(&kaslr_regions[i]); + + prandom_seed_state(&rand_state, kaslr_get_random_long("Memory")); + + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) { + unsigned long entropy; + + /* + * Select a random virtual address using the extra entropy + * available. + */ + entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); + prandom_bytes_state(&rand_state, &rand, sizeof(rand)); + entropy = (rand % (entropy + 1)) & PUD_MASK; + vaddr += entropy; + *kaslr_regions[i].base = vaddr; + + /* + * Jump the region and add a minimum padding based on + * randomization alignment. + */ + vaddr += get_padding(&kaslr_regions[i]); + vaddr = round_up(vaddr + 1, PUD_SIZE); + remain_entropy -= entropy; + } +} + +void __meminit init_trampoline_kaslr(void) +{ + pud_t *pud_page_tramp, *pud, *pud_tramp; + p4d_t *p4d_page_tramp, *p4d, *p4d_tramp; + unsigned long paddr, vaddr; + pgd_t *pgd; + + pud_page_tramp = alloc_low_page(); + + /* + * There are two mappings for the low 1MB area, the direct mapping + * and the 1:1 mapping for the real mode trampoline: + * + * Direct mapping: virt_addr = phys_addr + PAGE_OFFSET + * 1:1 mapping: virt_addr = phys_addr + */ + paddr = 0; + vaddr = (unsigned long)__va(paddr); + pgd = pgd_offset_k(vaddr); + + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + + pud_tramp = pud_page_tramp + pud_index(paddr); + *pud_tramp = *pud; + + if (pgtable_l5_enabled()) { + p4d_page_tramp = alloc_low_page(); + + p4d_tramp = p4d_page_tramp + p4d_index(paddr); + + set_p4d(p4d_tramp, + __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); + + trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)); + } else { + 
trampoline_pgd_entry = + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); + } +} diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c new file mode 100644 index 0000000000..9f82019179 --- /dev/null +++ b/arch/x86/mm/kmmio.c @@ -0,0 +1,632 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Support for MMIO probes. + * Benefit many code from kprobes + * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. + * 2007 Alexander Eichner + * 2008 Pekka Paalanen <pq@iki.fi> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> +#include <linux/hash.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/mutex.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <linux/errno.h> +#include <asm/debugreg.h> +#include <linux/mmiotrace.h> + +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long addr; /* the requested address */ + pteval_t old_presence; /* page presence prior to arming */ + bool armed; + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU) and post_kmmio_handler(). + * Protected by kmmio_lock, when linked into kmmio_page_table. + */ + int count; + + bool scheduled_for_release; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + unsigned long addr; + int active; +}; + +/* + * The kmmio_lock is taken in int3 context, which is treated as NMI context. 
+ * This causes lockdep to complain about it being in both NMI and normal + * context. Hide it from lockdep, as it should not have any other locks + * taken under it, and this is only enabled for debugging mmio anyway. + */ +static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED; + +/* Protected by kmmio_lock */ +unsigned int kmmio_count; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct list_head *kmmio_page_list(unsigned long addr) +{ + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; +} + +/* Accessed per-cpu */ +static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); + +/* + * This is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be simple) + * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup + */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ +static struct kmmio_probe *get_kmmio_probe(unsigned long addr) +{ + struct kmmio_probe *p; + list_for_each_entry_rcu(p, &kmmio_probes, list) { + if (addr >= p->addr && addr < (p->addr + p->len)) + return p; + } + return NULL; +} + +/* You must be holding RCU read lock. 
*/ +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) +{ + struct list_head *head; + struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); + list_for_each_entry_rcu(f, head, list) { + if (f->addr == addr) + return f; + } + return NULL; +} + +static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) +{ + pmd_t new_pmd; + pmdval_t v = pmd_val(*pmd); + if (clear) { + *old = v; + new_pmd = pmd_mkinvalid(*pmd); + } else { + /* Presume this has been called with clear==true previously */ + new_pmd = __pmd(*old); + } + set_pmd(pmd, new_pmd); +} + +static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) +{ + pteval_t v = pte_val(*pte); + if (clear) { + *old = v; + /* Nothing should care about address */ + pte_clear(&init_mm, 0, pte); + } else { + /* Presume this has been called with clear==true previously */ + set_pte_atomic(pte, __pte(*old)); + } +} + +static int clear_page_presence(struct kmmio_fault_page *f, bool clear) +{ + unsigned int level; + pte_t *pte = lookup_address(f->addr, &level); + + if (!pte) { + pr_err("no pte for addr 0x%08lx\n", f->addr); + return -1; + } + + switch (level) { + case PG_LEVEL_2M: + clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence); + break; + case PG_LEVEL_4K: + clear_pte_presence(pte, clear, &f->old_presence); + break; + default: + pr_err("unexpected page level 0x%x.\n", level); + return -1; + } + + flush_tlb_one_kernel(f->addr); + return 0; +} + +/* + * Mark the given page as not present. Access to it will trigger a fault. + * + * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the + * protection is ignored here. RCU read lock is assumed held, so the struct + * will not disappear unexpectedly. Furthermore, the caller must guarantee, + * that double arming the same virtual address (page) cannot occur. 
+ * + * Double disarming on the other hand is allowed, and may occur when a fault + * and mmiotrace shutdown happen simultaneously. + */ +static int arm_kmmio_fault_page(struct kmmio_fault_page *f) +{ + int ret; + WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); + if (f->armed) { + pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); + } + ret = clear_page_presence(f, true); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); + f->armed = true; + return ret; +} + +/** Restore the given page to saved presence state. */ +static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) +{ + int ret = clear_page_presence(f, false); + WARN_ONCE(ret < 0, + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); + f->armed = false; +} + +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefetching may + * trigger a page fault. We may be in the middle of a process switch. + * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate + * and they remain disabled throughout this function. + */ +int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); + + /* + * Hold the RCU read lock over single stepping to avoid looking + * up the probe and kmmio_fault_page again. The rcu_read_lock_sched() + * also disables preemption and prevents process switch during + * the single stepping. 
We can only handle one active kmmio trace + * per cpu, so ensure that we finish it before something else + * gets to run. + */ + rcu_read_lock_sched_notrace(); + + faultpage = get_kmmio_fault_page(page_base); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. The latter case should not be possible. + */ + goto no_kmmio; + } + + ctx = this_cpu_ptr(&kmmio_ctx); + if (ctx->active) { + if (page_base == ctx->addr) { + /* + * A second fault on the same page means some other + * condition needs handling by do_page_fault(), the + * page really not being present is the most common. + */ + pr_debug("secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); + + if (!faultpage->old_presence) + pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n", + addr, smp_processor_id()); + } else { + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at + * least prevents a panic. + */ + pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr); + disarm_kmmio_fault_page(faultpage); + } + goto no_kmmio; + } + ctx->active++; + + ctx->fpage = faultpage; + ctx->probe = get_kmmio_probe(page_base); + ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + ctx->addr = page_base; + + if (ctx->probe && ctx->probe->pre_handler) + ctx->probe->pre_handler(ctx->probe, regs, addr); + + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + + /* Now we set present bit in PTE and single step. */ + disarm_kmmio_fault_page(ctx->fpage); + + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. 
It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. + */ + + return 1; /* fault handled */ + +no_kmmio: + rcu_read_unlock_sched_notrace(); + return ret; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate + * and they remain disabled throughout this function. + * This must always get called as the pair to kmmio_handler(). + */ +static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) +{ + int ret = 0; + struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx); + + if (!ctx->active) { + /* + * debug traps without an active context are due to either + * something external causing them (f.e. using a debugger while + * mmio tracing enabled), or erroneous behaviour + */ + pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id()); + goto out; + } + + if (ctx->probe && ctx->probe->post_handler) + ctx->probe->post_handler(ctx->probe, condition, regs); + + /* Prevent racing against release_kmmio_fault_page(). */ + arch_spin_lock(&kmmio_lock); + if (ctx->fpage->count) + arm_kmmio_fault_page(ctx->fpage); + arch_spin_unlock(&kmmio_lock); + + regs->flags &= ~X86_EFLAGS_TF; + regs->flags |= ctx->saved_flags; + + /* These were acquired in kmmio_handler(). */ + ctx->active--; + BUG_ON(ctx->active); + rcu_read_unlock_sched_notrace(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (!(regs->flags & X86_EFLAGS_TF)) + ret = 1; +out: + return ret; +} + +/* You must be holding kmmio_lock. 
*/ +static int add_kmmio_fault_page(unsigned long addr) +{ + struct kmmio_fault_page *f; + + f = get_kmmio_fault_page(addr); + if (f) { + if (!f->count) + arm_kmmio_fault_page(f); + f->count++; + return 0; + } + + f = kzalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -1; + + f->count = 1; + f->addr = addr; + + if (arm_kmmio_fault_page(f)) { + kfree(f); + return -1; + } + + list_add_rcu(&f->list, kmmio_page_list(f->addr)); + + return 0; +} + +/* You must be holding kmmio_lock. */ +static void release_kmmio_fault_page(unsigned long addr, + struct kmmio_fault_page **release_list) +{ + struct kmmio_fault_page *f; + + f = get_kmmio_fault_page(addr); + if (!f) + return; + + f->count--; + BUG_ON(f->count < 0); + if (!f->count) { + disarm_kmmio_fault_page(f); + if (!f->scheduled_for_release) { + f->release_next = *release_list; + *release_list = f; + f->scheduled_for_release = true; + } + } +} + +/* + * With page-unaligned ioremaps, one or two armed pages may contain + * addresses from outside the intended mapping. Events for these addresses + * are currently silently dropped. The events may result only from programming + * mistakes by accessing addresses before the beginning or past the end of a + * mapping. 
+ */ +int register_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + int ret = 0; + unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; + + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); + if (get_kmmio_probe(addr)) { + ret = -EEXIST; + goto out; + } + + pte = lookup_address(addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + + kmmio_count++; + list_add_rcu(&p->list, &kmmio_probes); + while (size < size_lim) { + if (add_kmmio_fault_page(addr + size)) + pr_err("Unable to set page fault.\n"); + size += page_level_size(l); + } +out: + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); + + /* + * XXX: What should I do here? + * Here was a call to global_flush_tlb(), but it does not exist + * anymore. It seems it's not needed after all. + */ + return ret; +} +EXPORT_SYMBOL(register_kmmio_probe); + +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *f = dr->release_list; + while (f) { + struct kmmio_fault_page *next = f->release_next; + BUG_ON(f->count); + kfree(f); + f = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = + container_of(head, struct kmmio_delayed_release, rcu); + struct kmmio_fault_page *f = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); + while (f) { + if (!f->count) { + list_del_rcu(&f->list); + prevp = &f->release_next; + } else { + *prevp = f->release_next; + f->release_next = NULL; + f->scheduled_for_release = false; + } + f = *prevp; + } + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); + + /* This is the real RCU destroy call. 
*/ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actually free the kmmio_fault_page structs as with RCU. + */ +void unregister_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(addr, &l); + if (!pte) + return; + + local_irq_save(flags); + arch_spin_lock(&kmmio_lock); + while (size < size_lim) { + release_kmmio_fault_page(addr + size, &release_list); + size += page_level_size(l); + } + list_del_rcu(&p->list); + kmmio_count--; + arch_spin_unlock(&kmmio_lock); + local_irq_restore(flags); + + if (!release_list) + return; + + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. 
+ * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); +} +EXPORT_SYMBOL(unregister_kmmio_probe); + +static int +kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) +{ + struct die_args *arg = args; + unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err); + + if (val == DIE_DEBUG && (*dr6_p & DR_STEP)) + if (post_kmmio_handler(*dr6_p, arg->regs) == 1) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + *dr6_p &= ~DR_STEP; + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +int kmmio_init(void) +{ + int i; + + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + + return register_die_notifier(&nb_die); +} + +void kmmio_cleanup(void) +{ + int i; + + unregister_die_notifier(&nb_die); + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { + WARN_ONCE(!list_empty(&kmmio_page_table[i]), + KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); + } +} diff --git a/arch/x86/mm/kmsan_shadow.c b/arch/x86/mm/kmsan_shadow.c new file mode 100644 index 0000000000..bee2ec4a3b --- /dev/null +++ b/arch/x86/mm/kmsan_shadow.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86-specific bits of KMSAN shadow implementation. + * + * Copyright (C) 2022 Google LLC + * Author: Alexander Potapenko <glider@google.com> + */ + +#include <asm/cpu_entry_area.h> +#include <linux/percpu-defs.h> + +/* + * Addresses within the CPU entry area (including e.g. exception stacks) do not + * have struct page entries corresponding to them, so they need separate + * handling. 
+ * arch_kmsan_get_meta_or_null() (declared in the header) maps the addresses in + * CPU entry area to addresses in cpu_entry_area_shadow/cpu_entry_area_origin. + */ +DEFINE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow); +DEFINE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin); diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c new file mode 100644 index 0000000000..6993f026ad --- /dev/null +++ b/arch/x86/mm/maccess.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/uaccess.h> +#include <linux/kernel.h> + +#ifdef CONFIG_X86_64 +bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) +{ + unsigned long vaddr = (unsigned long)unsafe_src; + + /* + * Do not allow userspace addresses. This disallows + * normal userspace and the userspace guard page: + */ + if (vaddr < TASK_SIZE_MAX + PAGE_SIZE) + return false; + + /* + * Allow everything during early boot before 'x86_virt_bits' + * is initialized. Needed for instruction decoding in early + * exception handlers. + */ + if (!boot_cpu_data.x86_virt_bits) + return true; + + return __is_canonical_address(vaddr, boot_cpu_data.x86_virt_bits); +} +#else +bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) +{ + return (unsigned long)unsafe_src >= TASK_SIZE_MAX; +} +#endif diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 0000000000..9f27e14e18 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Memory Encryption Support Common Code + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. 
+ * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#include <linux/dma-direct.h> +#include <linux/dma-mapping.h> +#include <linux/swiotlb.h> +#include <linux/cc_platform.h> +#include <linux/mem_encrypt.h> + +/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ +bool force_dma_unencrypted(struct device *dev) +{ + /* + * For SEV, all DMA must be to unencrypted addresses. + */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return true; + + /* + * For SME, all DMA must be to unencrypted addresses if the + * device does not support DMA to addresses that include the + * encryption mask. + */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { + u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask)); + u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask, + dev->bus_dma_limit); + + if (dma_dev_mask <= dma_enc_mask) + return true; + } + + return false; +} + +static void print_mem_encrypt_feature_info(void) +{ + pr_info("Memory Encryption Features active:"); + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_cont(" Intel TDX\n"); + return; + } + + pr_cont(" AMD"); + + /* Secure Memory Encryption */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { + /* + * SME is mutually exclusive with any of the SEV + * features below. 
+ */ + pr_cont(" SME\n"); + return; + } + + /* Secure Encrypted Virtualization */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + pr_cont(" SEV"); + + /* Encrypted Register State */ + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + pr_cont(" SEV-ES"); + + /* Secure Nested Paging */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + pr_cont(" SEV-SNP"); + + pr_cont("\n"); +} + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void) +{ + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); + + print_mem_encrypt_feature_info(); +} diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c new file mode 100644 index 0000000000..45ff95264a --- /dev/null +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-direct.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/dma-mapping.h> +#include <linux/virtio_config.h> +#include <linux/virtio_anchor.h> +#include <linux/cc_platform.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/mem_encrypt.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> +#include <asm/sev.h> +#include <asm/ia32.h> + +#include "mm_internal.h" + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later 
cleared. + */ +u64 sme_me_mask __section(".data") = 0; +u64 sev_status __section(".data") = 0; +u64 sev_check_data __section(".data") = 0; +EXPORT_SYMBOL(sme_me_mask); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); + +/* + * SNP-specific routine which needs to additionally change the page state from + * private to shared before copying the data from the source to destination and + * restore after the copy. + */ +static inline void __init snp_memcpy(void *dst, void *src, size_t sz, + unsigned long paddr, bool decrypt) +{ + unsigned long npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (decrypt) { + /* + * @paddr needs to be accessed decrypted, mark the page shared in + * the RMP table before copying it. + */ + early_snp_set_memory_shared((unsigned long)__va(paddr), paddr, npages); + + memcpy(dst, src, sz); + + /* Restore the page state after the memcpy. */ + early_snp_set_memory_private((unsigned long)__va(paddr), paddr, npages); + } else { + /* + * @paddr need to be accessed encrypted, no need for the page state + * change. + */ + memcpy(dst, src, sz); + } +} + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. 
+ */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + snp_memcpy(sme_early_buffer, src, len, paddr, enc); + snp_memcpy(dst, sme_early_buffer, len, paddr, !enc); + } else { + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + } + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 
0 : size - PMD_SIZE; + } while (size); + + flush_tlb_local(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. 
+ * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. + * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); + + /* Set restricted memory access for virtio. */ + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); +} + +static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) +{ + unsigned long pfn = 0; + pgprot_t prot; + + switch (level) { + case PG_LEVEL_4K: + pfn = pte_pfn(*kpte); + prot = pte_pgprot(*kpte); + break; + case PG_LEVEL_2M: + pfn = pmd_pfn(*(pmd_t *)kpte); + prot = pmd_pgprot(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + pfn = pud_pfn(*(pud_t *)kpte); + prot = pud_pgprot(*(pud_t *)kpte); + break; + default: + WARN_ONCE(1, "Invalid level for kpte\n"); + return 0; + } + + if (ret_prot) + *ret_prot = prot; + + return pfn; +} + +static bool amd_enc_tlb_flush_required(bool enc) +{ + return true; +} + +static bool amd_enc_cache_flush_required(void) +{ + return !cpu_feature_enabled(X86_FEATURE_SME_COHERENT); +} + +static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc) +{ +#ifdef CONFIG_PARAVIRT + unsigned long vaddr_end = vaddr + size; + + while (vaddr < vaddr_end) { + int psize, pmask, level; + unsigned long pfn; + pte_t *kpte; + + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + WARN_ONCE(1, "kpte lookup for vaddr\n"); + return; + } + + pfn = pg_level_to_pfn(level, kpte, NULL); + if (!pfn) + continue; + + psize = page_level_size(level); + pmask = page_level_mask(level); + + notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc); + + vaddr = (vaddr & pmask) + psize; + } +#endif +} + +static bool 
amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc) +{ + /* + * To maintain the security guarantees of SEV-SNP guests, make sure + * to invalidate the memory before encryption attribute is cleared. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc) + snp_set_memory_shared(vaddr, npages); + + return true; +} + +/* Return true unconditionally: return value doesn't matter for the SEV side */ +static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc) +{ + /* + * After memory is mapped encrypted in the page table, validate it + * so that it is consistent with the page table updates. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && enc) + snp_set_memory_private(vaddr, npages); + + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc); + + return true; +} + +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + pgprot_t old_prot, new_prot; + unsigned long pfn, pa, size; + pte_t new_pte; + + pfn = pg_level_to_pfn(level, kpte, &old_prot); + if (!pfn) + return; + + new_prot = old_prot; + if (enc) + pgprot_val(new_prot) |= _PAGE_ENC; + else + pgprot_val(new_prot) &= ~_PAGE_ENC; + + /* If prot is same then do nothing. */ + if (pgprot_val(old_prot) == pgprot_val(new_prot)) + return; + + pa = pfn << PAGE_SHIFT; + size = page_level_size(level); + + /* + * We are going to perform in-place en-/decryption and change the + * physical page attribute from C=1 to C=0 or vice versa. Flush the + * caches to ensure that data gets accessed with the correct C-bit. + */ + clflush_cache_range(__va(pa), size); + + /* Encrypt/decrypt the contents in-place */ + if (enc) { + sme_early_encrypt(pa, size); + } else { + sme_early_decrypt(pa, size); + + /* + * ON SNP, the page state in the RMP table must happen + * before the page table updates. + */ + early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1); + } + + /* Change the page encryption mask. 
*/ + new_pte = pfn_pte(pfn, new_prot); + set_pte_atomic(kpte, new_pte); + + /* + * If page is set encrypted in the page table, then update the RMP table to + * add this page as private. + */ + if (enc) + early_snp_set_memory_private((unsigned long)__va(pa), pa, 1); +} + +static int __init early_set_memory_enc_dec(unsigned long vaddr, + unsigned long size, bool enc) +{ + unsigned long vaddr_end, vaddr_next, start; + unsigned long psize, pmask; + int split_page_size_mask; + int level, ret; + pte_t *kpte; + + start = vaddr; + vaddr_next = vaddr; + vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + ret = 1; + goto out; + } + + if (level == PG_LEVEL_4K) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; + continue; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Check whether we can change the large page in one go. + * We request a split when the address is not aligned and + * the number of pages to set/clear encryption bit is smaller + * than the number of pages in the large page. + */ + if (vaddr == (vaddr & pmask) && + ((vaddr_end - vaddr) >= psize)) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & pmask) + psize; + continue; + } + + /* + * The virtual address is part of a larger page, create the next + * level page table mapping (4K or 2M). If it is part of a 2M + * page then we request a split of the large page into 4K + * chunks. A 1GB large page is split into 2M pages, resp. + */ + if (level == PG_LEVEL_2M) + split_page_size_mask = 0; + else + split_page_size_mask = 1 << PG_LEVEL_2M; + + /* + * kernel_physical_mapping_change() does not flush the TLBs, so + * a TLB flush is required after we exit from the for loop. 
+ */ + kernel_physical_mapping_change(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); + } + + ret = 0; + + early_set_mem_enc_dec_hypercall(start, size, enc); +out: + __flush_tlb_all(); + return ret; +} + +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, false); +} + +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, true); +} + +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc) +{ + enc_dec_hypercall(vaddr, size, enc); +} + +void __init sme_early_init(void) +{ + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + add_encrypt_protection_map(); + + x86_platform.guest.enc_status_change_prepare = amd_enc_status_change_prepare; + x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish; + x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required; + x86_platform.guest.enc_cache_flush_required = amd_enc_cache_flush_required; + + /* + * AMD-SEV-ES intercepts the RDMSR to read the X2APIC ID in the + * parallel bringup low level code. That raises #VC which cannot be + * handled there. + * It does not provide a RDMSR GHCB protocol so the early startup + * code cannot directly communicate with the secure firmware. The + * alternative solution to retrieve the APIC ID via CPUID(0xb), + * which is covered by the GHCB protocol, is not viable either + * because there is no enforcement of the CPUID(0xb) provided + * "initial" APIC ID to be the same as the real APIC ID. + * Disable parallel bootup. 
+ */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) + x86_cpuinit.parallel_bringup = false; + + /* + * The VMM is capable of injecting interrupt 0x80 and triggering the + * compatibility syscall path. + * + * By default, the 32-bit emulation is disabled in order to ensure + * the safety of the VM. + */ + if (sev_status & MSR_AMD64_SEV_ENABLED) + ia32_disable(); +} + +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * If the unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. Base the + * re-encryption on the same condition used for the decryption in + * sme_postprocess_startup(). Higher level abstractions, such as + * CC_ATTR_MEM_ENCRYPT, aren't necessarily equivalent in a Hyper-V VM + * using vTOM, where sme_me_mask is always zero. + */ + if (sme_me_mask) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S new file mode 100644 index 0000000000..e25288ee33 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. 
+ * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#include <linux/linkage.h> +#include <linux/pgtable.h> +#include <asm/page.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> +#include <asm/nospec-branch.h> + + .text + .code64 +SYM_FUNC_START(sme_encrypt_execute) + + /* + * Entry parameters: + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping + * RDX - length to encrypt + * RCX - virtual address of the encryption workarea, including: + * - stack page (PAGE_SIZE) + * - encryption routine page (PAGE_SIZE) + * - intermediate copy buffer (PMD_SIZE) + * R8 - physical address of the pagetables to use for encryption + */ + + push %rbp + movq %rsp, %rbp /* RBP now has original stack pointer */ + + /* Set up a one page stack in the non-encrypted memory area */ + movq %rcx, %rax /* Workarea stack page */ + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */ + addq $PAGE_SIZE, %rax /* Workarea encryption routine */ + + push %r12 + movq %rdi, %r10 /* Encrypted area */ + movq %rsi, %r11 /* Decrypted area */ + movq %rdx, %r12 /* Area length */ + + /* Copy encryption routine into the workarea */ + movq %rax, %rdi /* Workarea encryption routine */ + leaq __enc_copy(%rip), %rsi /* Encryption routine */ + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */ + rep movsb + + /* Setup registers for call */ + movq %r10, %rdi /* Encrypted area */ + movq %r11, %rsi /* Decrypted area */ + movq %r8, %rdx /* Pagetables used for encryption */ + movq %r12, %rcx /* Area length */ + movq %rax, %r8 /* Workarea encryption routine */ + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ + + ANNOTATE_RETPOLINE_SAFE + call *%rax /* Call the encryption routine */ + + pop %r12 + + movq %rbp, %rsp /* Restore original stack pointer */ + pop %rbp + + /* Offset to __x86_return_thunk would be wrong here */ + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(sme_encrypt_execute) + 
+SYM_FUNC_START(__enc_copy) +/* + * Routine used to encrypt memory in place. + * This routine must be run outside of the kernel proper since + * the kernel will be encrypted during the process. So this + * routine is defined here and then copied to an area outside + * of the kernel where it will remain and run decrypted + * during execution. + * + * On entry the registers must be: + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping + * RDX - address of the pagetables to use for encryption + * RCX - length of area + * R8 - intermediate copy buffer + * + * RAX - points to this routine + * + * The area will be encrypted by copying from the non-encrypted + * memory space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted memory space. The physical + * addresses of the two mappings are the same which results in the area + * being encrypted "in place". + */ + /* Enable the new page tables */ + mov %rdx, %cr3 + + /* Flush any global TLBs */ + mov %cr4, %rdx + andq $~X86_CR4_PGE, %rdx + mov %rdx, %cr4 + orq $X86_CR4_PGE, %rdx + mov %rdx, %cr4 + + push %r15 + push %r12 + + movq %rcx, %r9 /* Save area length */ + movq %rdi, %r10 /* Save encrypted area address */ + movq %rsi, %r11 /* Save decrypted area address */ + + /* Set the PAT register PA5 entry to write-protect */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + mov %rdx, %r15 /* Save original PAT value */ + andl $0xffff00ff, %edx /* Clear PA5 */ + orl $0x00000500, %edx /* Set PA5 to WP */ + wrmsr + + wbinvd /* Invalidate any cache entries */ + + /* Copy/encrypt up to 2MB at a time */ + movq $PMD_SIZE, %r12 +1: + cmpq %r12, %r9 + jnb 2f + movq %r9, %r12 + +2: + movq %r11, %rsi /* Source - decrypted area */ + movq %r8, %rdi /* Dest - intermediate copy buffer */ + movq %r12, %rcx + rep movsb + + movq %r8, %rsi /* Source - intermediate copy buffer */ + movq %r10, %rdi /* Dest - encrypted area */ + movq %r12, %rcx + rep movsb + + addq 
%r12, %r11 + addq %r12, %r10 + subq %r12, %r9 /* Kernel length decrement */ + jnz 1b /* Kernel length not zero? */ + + /* Restore PAT register */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + mov %r15, %rdx /* Restore original PAT value */ + wrmsr + + pop %r12 + pop %r15 + + /* Offset to __x86_return_thunk would be wrong here */ + ANNOTATE_UNRET_SAFE + ret + int3 +.L__enc_copy_end: +SYM_FUNC_END(__enc_copy) diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c new file mode 100644 index 0000000000..d73aeb1641 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -0,0 +1,618 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#define DISABLE_BRANCH_PROFILING + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* + * Special hack: we have to be careful, because no indirections are + * allowed here, and paravirt_ops is a kind of one. As it will only run in + * baremetal anyway, we just keep it from happening. (This list needs to + * be extended when new paravirt and debugging variants are added.) + */ +#undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_XXL +#undef CONFIG_PARAVIRT_SPINLOCKS + +/* + * This code runs before CPU feature bits are set. By default, the + * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if + * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5 + * is provided to handle this situation and, instead, use a variable that + * has been set by the early boot code. 
+ */ +#define USE_EARLY_PGTABLE_L5 + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> + +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/cmdline.h> +#include <asm/coco.h> +#include <asm/sev.h> + +#include "mm_internal.h" + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \ + (_PAGE_PAT_LARGE | _PAGE_PWT)) + +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) + +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) + +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_flags; + pteval_t pte_flags; + unsigned long paddr; + + unsigned long vaddr; + unsigned long vaddr_end; +}; + +/* + * This work area lives in the .init.scratch section, which lives outside of + * the kernel proper. It is sized to hold the intermediate copy buffer and + * more than enough pagetable pages. + * + * By using this section, the kernel can be encrypted in place and it + * avoids any possibility of boot parameters or initramfs images being + * placed such that the in-place encryption logic overwrites them. This + * section is 2MB aligned to allow for simple pagetable setup using only + * PMD entries (see vmlinux.lds.S). 
+ */ +static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); + + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); + + memset(pgd_p, 0, pgd_size); +} + +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = ppd->pgd + pgd_index(ppd->vaddr); + if (pgd_none(*pgd)) { + p4d = ppd->pgtable_area; + memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D); + ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D; + set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d))); + } + + p4d = p4d_offset(pgd, ppd->vaddr); + if (p4d_none(*p4d)) { + pud = ppd->pgtable_area; + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD); + ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD; + set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud))); + } + + pud = pud_offset(p4d, ppd->vaddr); + if (pud_none(*pud)) { + pmd = ppd->pgtable_area; + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD); + ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD; + set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); + } + + if (pud_large(*pud)) + return NULL; + + return pud; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_large(*pmd)) + return; + + set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); +} + +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = 
pmd_offset(pud, ppd->vaddr); + if (pmd_none(*pmd)) { + pte = ppd->pgtable_area; + memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE; + set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); + } + + if (pmd_large(*pmd)) + return; + + pte = pte_offset_kernel(pmd, ppd->vaddr); + if (pte_none(*pte)) + set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_SIZE; + ppd->paddr += PMD_SIZE; + } +} + +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = vaddr_end & PMD_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long entries = 0, tables = 
0; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. + */ + + /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */ + if (PTRS_PER_P4D > 1) + entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D; + entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD; + entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD; + entries += 2 * sizeof(pte_t) * PTRS_PER_PTE; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + + if (PTRS_PER_P4D > 1) + tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D; + tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD; + tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD; + + return entries + tables; +} + +void __init sme_encrypt_kernel(struct boot_params *bp) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; + struct sme_populate_pgd_data ppd; + unsigned long pgtable_area_len; + unsigned long decrypted_base; + + /* + * This is early code, use an open coded check for SME instead of + * using cc_platform_has(). 
This eliminates worries about removing + * instrumentation or checking boot_cpu_data in the cc_platform_has() + * function. + */ + if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED) + return; + + /* + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel and initrd as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel and initrd as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE); + kernel_len = kernel_end - kernel_start; + + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + + /* + * We're running identity mapped, so we must obtain the address to the + * SME encryption workarea using rip-relative addressing. 
+ */ + asm ("lea sme_workarea(%%rip), %0" + : "=r" (workarea_start) + : "p" (sme_workarea)); + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE); + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + ppd.pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. 
+ */ + ppd.pgd = (pgd_t *)native_read_cr3_pa(); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. + */ + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. + */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } + decrypted_base <<= PGDIR_SHIFT; + + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + } + + 
/* Add decrypted workarea mappings to both kernel mappings */ + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)ppd.pgd); + + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); + + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } + + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + bool snp; + u64 msr; + + snp = snp_init(bp); + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID 
Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + /* Check whether SEV or SME is supported */ + if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT))) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check the SEV MSR whether SEV or SME is enabled */ + sev_status = __rdmsr(MSR_AMD64_SEV); + feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; + + /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */ + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + snp_abort(); + + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* + * No SME if Hypervisor bit is set. This check is here to + * prevent a guest from trying to enable SME. For running as a + * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there + * might be other hypervisors which emulate that MSR as non-zero + * or even pass it through to the guest. + * A malicious hypervisor can still trick a guest into this + * path, but there is no way to protect against that. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (ecx & BIT(31)) + return; + + /* For SME, check the SYSCFG MSR */ + msr = __rdmsr(MSR_AMD64_SYSCFG); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) + return; + } else { + /* SEV state cannot be controlled by a command line option */ + sme_me_mask = me_mask; + goto out; + } + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. 
+ */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0) + return; + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; +out: + if (sme_me_mask) { + physical_mask &= ~sme_me_mask; + cc_vendor = CC_VENDOR_AMD; + cc_set_mask(sme_me_mask); + } +} diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 0000000000..3f37b5c80b --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ + return alloc_low_pages(1); +} + +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask, + pgprot_t prot); +unsigned long kernel_physical_mapping_change(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +void zone_sizes_init(void); + +extern int after_bootmem; + +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); + +extern unsigned long tlb_single_page_flush_ceiling; + +#endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c new file mode 100644 index 0000000000..c90c20904a --- /dev/null +++ 
b/arch/x86/mm/mmap.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Flexible mmap layout support + * + * Based on code by Ingo Molnar and Andi Kleen, copyrighted + * as follows: + * + * Copyright 2003-2009 Red Hat Inc. + * All Rights Reserved. + * Copyright 2005 Andi Kleen, SUSE Labs. + * Copyright 2007 Jiri Kosina, SUSE Labs. + */ + +#include <linux/personality.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/limits.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> +#include <linux/compat.h> +#include <linux/elf-randomize.h> +#include <asm/elf.h> +#include <asm/io.h> + +#include "physaddr.h" + +struct va_alignment __read_mostly va_align = { + .flags = -1, +}; + +unsigned long task_size_32bit(void) +{ + return IA32_PAGE_OFFSET; +} + +unsigned long task_size_64bit(int full_addr_space) +{ + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; +} + +static unsigned long stack_maxrandom_size(unsigned long task_size) +{ + unsigned long max = 0; + if (current->flags & PF_RANDOMIZE) { + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); + max <<= PAGE_SHIFT; + } + + return max; +} + +#ifdef CONFIG_COMPAT +# define mmap32_rnd_bits mmap_rnd_compat_bits +# define mmap64_rnd_bits mmap_rnd_bits +#else +# define mmap32_rnd_bits mmap_rnd_bits +# define mmap64_rnd_bits mmap_rnd_bits +#endif + +#define SIZE_128M (128 * 1024 * 1024UL) + +static int mmap_is_legacy(void) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + return sysctl_legacy_va_layout; +} + +static unsigned long arch_rnd(unsigned int rndbits) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; +} + +unsigned long arch_mmap_rnd(void) +{ + return arch_rnd(mmap_is_ia32() ? 
mmap32_rnd_bits : mmap64_rnd_bits); +} + +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, + struct rlimit *rlim_stack) +{ + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; + unsigned long gap_min, gap_max; + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + + /* + * Top of mmap area (just below the process stack). + * Leave an at least ~128 MB hole with possible stack randomization. + */ + gap_min = SIZE_128M; + gap_max = (task_size / 6) * 5; + + if (gap < gap_min) + gap = gap_min; + else if (gap > gap_max) + gap = gap_max; + + return PAGE_ALIGN(task_size - gap - rnd); +} + +static unsigned long mmap_legacy_base(unsigned long rnd, + unsigned long task_size) +{ + return __TASK_UNMAPPED_BASE(task_size) + rnd; +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, + unsigned long random_factor, unsigned long task_size, + struct rlimit *rlim_stack) +{ + *legacy_base = mmap_legacy_base(random_factor, task_size); + if (mmap_is_legacy()) + *base = *legacy_base; + else + *base = mmap_base(random_factor, task_size, rlim_stack); +} + +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +{ + if (mmap_is_legacy()) + mm->get_unmapped_area = arch_get_unmapped_area; + else + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + + arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, + arch_rnd(mmap64_rnd_bits), task_size_64bit(0), + rlim_stack); + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* + * The mmap syscall mapping base decision depends solely on the + * syscall type (64-bit or compat). This applies for 64bit + * applications and 32bit applications. The 64bit syscall uses + * mmap_base, the compat syscall uses mmap_compat_base. 
+ */ + arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, + arch_rnd(mmap32_rnd_bits), task_size_32bit(), + rlim_stack); +#endif +} + +unsigned long get_mmap_base(int is_legacy) +{ + struct mm_struct *mm = current->mm; + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + if (in_32bit_syscall()) { + return is_legacy ? mm->mmap_compat_legacy_base + : mm->mmap_compat_base; + } +#endif + return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +/** + * mmap_address_hint_valid - Validate the address hint of mmap + * @addr: Address hint + * @len: Mapping length + * + * Check whether @addr and @addr + @len result in a valid mapping. + * + * On 32bit this only checks whether @addr + @len is <= TASK_SIZE. + * + * On 64bit with 5-level page tables another sanity check is required + * because mappings requested by mmap(@addr, 0) which cross the 47-bit + * virtual address boundary can cause the following theoretical issue: + * + * An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr + * is below the border of the 47-bit address space and @addr + @len is + * above the border. + * + * With 4-level paging this request succeeds, but the resulting mapping + * address will always be within the 47-bit virtual address space, because + * the hint address does not result in a valid mapping and is + * ignored. Hence applications which are not prepared to handle virtual + * addresses above 47-bit work correctly. + * + * With 5-level paging this request would be granted and result in a + * mapping which crosses the border of the 47-bit virtual address + * space. If the application cannot handle addresses above 47-bit this + * will lead to misbehaviour and hard to diagnose failures. + * + * Therefore ignore address hints which would result in a mapping crossing + * the 47-bit virtual address boundary. 
+ * + * Note, that in the same scenario with MAP_FIXED the behaviour is + * different. The request with @addr < 47-bit and @addr + @len > 47-bit + * fails on a 4-level paging machine but succeeds on a 5-level paging + * machine. It is reasonable to expect that an application does not rely on + * the failure of such a fixed mapping request, so the restriction is not + * applied. + */ +bool mmap_address_hint_valid(unsigned long addr, unsigned long len) +{ + if (TASK_SIZE - len < addr) + return false; + + return (addr > DEFAULT_MAP_WINDOW) == (addr + len > DEFAULT_MAP_WINDOW); +} + +/* Can we access it for direct reading/writing? Must be RAM: */ +int valid_phys_addr_range(phys_addr_t addr, size_t count) +{ + return addr + count - 1 <= __pa(high_memory - 1); +} + +/* Can we access it through mmap? Must be a valid physical address: */ +int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) +{ + phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT; + + return phys_addr_valid(addr + count - 1); +} + +/* + * Only allow root to set high MMIO mappings to PROT_NONE. + * This prevents an unpriv. user to set them to PROT_NONE and invert + * them, then pointing to valid memory for L1TF speculation. + * + * Note: for locked down kernels may want to disable the root override. 
+ */ +bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return true; + if (!__pte_needs_invert(pgprot_val(prot))) + return true; + /* If it's real memory always allow */ + if (pfn_valid(pfn)) + return true; + if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + return false; + return true; +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c new file mode 100644 index 0000000000..c3317f0650 --- /dev/null +++ b/arch/x86/mm/mmio-mod.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 <pq@iki.fi> + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. + */ + +#define pr_fmt(fmt) "mmiotrace: " fmt + +#include <linux/moduleparam.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/mmiotrace.h> +#include <linux/pgtable.h> +#include <asm/e820/api.h> /* for ISA_START_ADDRESS */ +#include <linux/atomic.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include "pf_in.h" + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + resource_size_t phys; + unsigned long id; +}; + +/* Accessed per-cpu. */ +static DEFINE_PER_CPU(struct trap_reason, pf_reason); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); + +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. 
+ * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed. + * - pre/post callbacks assume the effect of is_enabled() being true. + */ + +/* module parameters */ +static unsigned long filter_offset; +static bool nommiotrace; +static bool trace_pc; + +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} + +static void print_pte(unsigned long address) +{ + unsigned int level; + pte_t *pte = lookup_address(address, &level); + + if (!pte) { + pr_err("Error in %s: no pte for page 0x%08lx\n", + __func__, address); + return; + } + + if (level == PG_LEVEL_2M) { + pr_emerg("4MB pages are not currently supported: 0x%08lx\n", + address); + BUG(); + } + pr_info("pte for 0x%lx: 0x%llx 0x%llx\n", + address, + (unsigned long long)pte_val(*pte), + (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. 
+ */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const struct trap_reason *my_reason = &get_cpu_var(pf_reason); + pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n", + addr, my_reason->addr); + print_pte(addr); + pr_emerg("faulting IP is at %pS\n", (void *)regs->ip); + pr_emerg("last faulting IP was at %pS\n", (void *)my_reason->ip); +#ifdef __i386__ + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + put_cpu_var(pf_reason); + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->private; + + /* it doesn't make sense to have more than one active trace per cpu */ + if (my_reason->active_traces) + die_kmmio_nesting_error(regs, addr); + else + my_reason->active_traces++; + + my_reason->type = type; + my_reason->addr = addr; + my_reason->ip = instptr; + + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + my_trace->pc = instptr; + else + my_trace->pc = 0; + + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? 
+ */ + + switch (type) { + case REG_READ: + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); + break; + case REG_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | + *(ip + 2); + } + } + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + + /* this should always return the active_trace count to 0 */ + my_reason->active_traces--; + if (my_reason->active_traces) { + pr_emerg("unexpected post handler"); + BUG(); + } + + switch (my_reason->type) { + case REG_READ: + my_trace->value = get_ins_reg_val(my_reason->ip, regs); + break; + default: + break; + } + + mmio_trace_rw(my_trace); + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void ioremap_trace_core(resource_size_t offset, unsigned long size, + void __iomem *addr) +{ + static atomic_t next_id; + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + /* These are page-unaligned. 
*/ + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE + }; + + if (!trace) { + pr_err("kmalloc failed in ioremap\n"); + return; + } + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + .private = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) + }; + map.map_id = trace->id; + + spin_lock_irq(&trace_lock); + if (!is_enabled()) { + kfree(trace); + goto not_enabled; + } + + mmio_trace_mapping(&map); + list_add_tail(&trace->list, &trace_list); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); +} + +void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr) +{ + if (!is_enabled()) /* recheck and proper locking in *_core() */ + return; + + pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) + return; + ioremap_trace_core(offset, size, addr); +} + +static void iounmap_trace_core(volatile void __iomem *addr) +{ + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE + }; + struct remap_trace *trace; + struct remap_trace *tmp; + struct remap_trace *found_trace = NULL; + + pr_debug("Unmapping %p.\n", addr); + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + found_trace = trace; + break; + } + } + map.map_id = (found_trace) ? 
found_trace->id : -1; + mmio_trace_mapping(&map); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); +} + +int mmiotrace_printk(const char *fmt, ...) +{ + int ret = 0; + va_list args; + unsigned long flags; + va_start(args, fmt); + + spin_lock_irqsave(&trace_lock, flags); + if (is_enabled()) + ret = mmio_trace_printk(fmt, args); + spin_unlock_irqrestore(&trace_lock, flags); + + va_end(args); + return ret; +} +EXPORT_SYMBOL(mmiotrace_printk); + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. 
+ */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + list_del(&trace->list); + kfree(trace); + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static cpumask_var_t downed_cpus; + +static void enter_uniprocessor(void) +{ + int cpu; + int err; + + if (!cpumask_available(downed_cpus) && + !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { + pr_notice("Failed to allocate mask\n"); + goto out; + } + + cpus_read_lock(); + cpumask_copy(downed_cpus, cpu_online_mask); + cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); + if (num_online_cpus() > 1) + pr_notice("Disabling non-boot CPUs...\n"); + cpus_read_unlock(); + + for_each_cpu(cpu, downed_cpus) { + err = remove_cpu(cpu); + if (!err) + pr_info("CPU%d is down.\n", cpu); + else + pr_err("Error taking CPU%d down: %d\n", cpu, err); + } +out: + if (num_online_cpus() > 1) + pr_warn("multiple CPUs still online, may miss events.\n"); +} + +static void leave_uniprocessor(void) +{ + int cpu; + int err; + + if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus)) + return; + pr_notice("Re-enabling CPUs...\n"); + for_each_cpu(cpu, downed_cpus) { + err = add_cpu(cpu); + if (!err) + pr_info("enabled CPU%d.\n", cpu); + else + pr_err("cannot re-enable CPU%d: %d\n", cpu, err); + } +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static void enter_uniprocessor(void) +{ + if (num_online_cpus() > 1) + pr_warn("multiple CPUs are online, may miss events. 
" + "Suggest booting with maxcpus=1 kernel argument.\n"); +} + +static void leave_uniprocessor(void) +{ +} +#endif + +void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + + if (nommiotrace) + pr_info("MMIO tracing disabled.\n"); + kmmio_init(); + enter_uniprocessor(); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info("enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + leave_uniprocessor(); + kmmio_cleanup(); + pr_info("disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c new file mode 100644 index 0000000000..aa39d678fe --- /dev/null +++ b/arch/x86/mm/numa.c @@ -0,0 +1,1037 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Common code for 32 and 64-bit NUMA */ +#include <linux/acpi.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/ctype.h> +#include <linux/nodemask.h> +#include <linux/sched.h> +#include <linux/topology.h> +#include <linux/sort.h> + +#include <asm/e820/api.h> +#include <asm/proto.h> +#include <asm/dma.h> +#include <asm/amd_nb.h> + +#include "numa_internal.h" + +int numa_off; +nodemask_t numa_nodes_parsed __initdata; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo __initdata_or_meminfo; +static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; + +static int numa_distance_cnt; +static u8 *numa_distance; + +static __init int numa_setup(char *opt) +{ + if (!opt) + return -EINVAL; + if 
(!strncmp(opt, "off", 3)) + numa_off = 1; + if (!strncmp(opt, "fake=", 5)) + return numa_emu_cmdline(opt + 5); + if (!strncmp(opt, "noacpi", 6)) + disable_srat(); + if (!strncmp(opt, "nohmat", 6)) + disable_hmat(); + return 0; +} +early_param("numa", numa_setup); + +/* + * apicid, cpu, node mappings + */ +s16 __apicid_to_node[MAX_LOCAL_APIC] = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + +int numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +void numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + set_cpu_numa_node(cpu, node); +} + +void numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: cpumask_of_node() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) 
+ */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) + setup_nr_node_ids(); + + /* allocate the map */ + for (node = 0; node < nr_node_ids; node++) + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); + + /* cpumask_of_node() will now work */ + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); +} + +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. 
+ */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another + * @dst: numa_meminfo to append block to + * @idx: Index of memblk to remove + * @src: numa_meminfo to remove memblk from + */ +static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, + struct numa_meminfo *src) +{ + dst->blk[dst->nr_blks++] = src->blk[idx]; + numa_remove_memblk_from(idx, src); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Allocate NODE_DATA for a node on the local memory */ +static void __init alloc_node_data(int nid) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + u64 nd_pa; + void *nd; + int tnid; + + /* + * Allocate node data. Try node-local memory and then any node. + * Never allocate in DMA zone. 
+ */ + nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", + nd_size, nid); + return; + } + nd = __va(nd_pa); + + /* report and initialize */ + printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + + node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unnecessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = 0; + const u64 high = PFN_PHYS(max_pfn); + int i, j, k; + + /* first, trim all entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* move / save reserved memory ranges */ + if (!memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) { + numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); + continue; + } + + /* make sure all non-reserved blocks are inside the limits */ + bi->start = max(bi->start, low); + + /* preserve info for non-RAM areas above 'max_pfn': */ + if (bi->end > high) { + numa_add_memblk_to(bi->nid, high, bi->end, + &numa_reserved_meminfo); + bi->end = high; + } + + /* and there's no empty block */ + if (bi->start >= bi->end) + numa_remove_memblk_from(i--, mi); + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + u64 start, end; + + /* + * See whether there are overlapping blocks. 
Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->nid, bj->start, bj->end - 1); + return -EINVAL; + } + pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, bj->start, + bj->end - 1, start, end - 1); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. 
+ */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_free(numa_distance, size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); + if (!phys) { + pr_warn("Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). 
+ * + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + u64 numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((s64)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - absent_pages_in_range(0, max_pfn); + + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. 
Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ +static void __init numa_clear_kernel_node_hotplug(void) +{ + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; + + /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * + * At this time, all memory regions reserved by memblock are + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; + + ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); + } + + /* + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. + * + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] + */ + for_each_reserved_mem_region(mb_region) { + int nid = memblock_get_region_node(mb_region); + + if (nid != MAX_NUMNODES) + node_set(nid, reserved_nodemask); + } + + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. 
+ * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become un-hotpluggable: + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = numa_meminfo.blk + i; + + if (!node_isset(mb->nid, reserved_nodemask)) + continue; + + memblock_clear_hotplug(mb->start, mb->end - mb->start); + } +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb = &mi->blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); + } + + /* + * Very early during boot, the kernel has to use some memory, e.g. for + * loading the kernel image. We cannot prevent this anyway. So any + * node the kernel resides in should be un-hotpluggable. + * + * And when we come here, alloc node data won't fail. + */ + numa_clear_kernel_node_hotplug(); + + /* + * If sections array is gonna be used for pfn -> nid mapping, check + * whether its granularity is fine enough. + */ + if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { + unsigned long pfn_align = node_map_pfn_alignment(); + + if (pfn_align && pfn_align < PAGES_PER_SECTION) { + pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", + PFN_PHYS(pfn_align) >> 20, + PFN_PHYS(PAGES_PER_SECTION) >> 20); + return -EINVAL; + } + } + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes.
+ */ + for_each_node_mask(nid, node_possible_map) { + u64 start = PFN_PHYS(max_pfn); + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start >= end) + continue; + + alloc_node_data(nid); + } + + /* Dump memblock with node info and return. */ + memblock_dump_all(); + return 0; +} + +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ +static void __init numa_init_array(void) +{ + int rr, i; + + rr = first_node(node_online_map); + for (i = 0; i < nr_cpu_ids; i++) { + if (early_cpu_to_node(i) != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node_in(rr, node_online_map); + } +} + +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, + MAX_NUMNODES)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, + MAX_NUMNODES)); + /* In case that parsing SRAT failed. */ + WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + + /* + * We reset memblock back to the top-down direction + * here because if we configured ACPI_NUMA, we have + * parsed SRAT in init_func(). It is ok to have the + * reset here even if we didn't configure ACPI_NUMA + * or acpi numa init fails and falls back to dummy + * numa init.
+ */ + memblock_set_bottom_up(false); + + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", + 0LLU, PFN_PHYS(max_pfn) - 1); + + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); + + return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encompassing whole memory and + * never fails. + */ +void __init x86_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA + if (!numa_init(x86_acpi_numa_init)) + return; +#endif +#ifdef CONFIG_AMD_NUMA + if (!numa_init(amd_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +} + + +/* + * A node may exist which has one or more Generic Initiators but no CPUs and no + * memory. + * + * This function must be called after init_cpu_to_node(), to ensure that any + * memoryless CPU nodes have already been brought online, and before the + * node_data[nid] is needed for zone list setup in build_all_zonelists(). 
+ * + * When this function is called, any nodes containing either memory and/or CPUs + * will already be online and there is no need to do anything extra, even if + * they also contain one or more Generic Initiators. + */ +void __init init_gi_nodes(void) +{ + int nid; + + /* + * Exclude this node from + * bringup_nonboot_cpus + * cpu_up + * __try_online_node + * register_one_node + * because node_subsys is not initialized yet. + * TODO remove dependency on node_online + */ + for_each_node_state(nid, N_GENERIC_INITIATOR) + if (!node_online(nid)) + node_set_online(nid); +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. + */ +void __init init_cpu_to_node(void) +{ + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { + int node = numa_cpu_node(cpu); + + if (node == NUMA_NO_NODE) + continue; + + /* + * Exclude this node from + * bringup_nonboot_cpus + * cpu_up + * __try_online_node + * register_one_node + * because node_subsys is not initialized yet. 
+ * TODO remove dependency on node_online + */ + if (!node_online(node)) + node_set_online(node); + + numa_set_node(cpu, node); + } +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +# ifndef CONFIG_NUMA_EMU +void numa_add_cpu(int cpu) +{ + cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void numa_remove_cpu(int cpu) +{ + cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} +# endif /* !CONFIG_NUMA_EMU */ + +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +int __cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(__cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!cpu_possible(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + +void debug_cpumask_set_cpu(int cpu, int node, bool enable) +{ + struct cpumask *mask; + + if (node == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + mask = node_to_cpumask_map[node]; + if (!cpumask_available(mask)) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return; + } + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + + printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", + enable ? 
"numa_add_cpu" : "numa_remove_cpu", + cpu, node, cpumask_pr_args(mask)); + return; +} + +# ifndef CONFIG_NUMA_EMU +static void numa_set_cpumask(int cpu, bool enable) +{ + debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); +} + +void numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, false); +} +# endif /* !CONFIG_NUMA_EMU */ + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const struct cpumask *cpumask_of_node(int node) +{ + if ((unsigned)node >= nr_node_ids) { + printk(KERN_WARNING + "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n", + node, nr_node_ids); + dump_stack(); + return cpu_none_mask; + } + if (!cpumask_available(node_to_cpumask_map[node])) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return cpu_online_mask; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#ifdef CONFIG_NUMA_KEEP_MEMINFO +static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + return mi->blk[i].nid; + return NUMA_NO_NODE; +} + +int phys_to_target_node(phys_addr_t start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + /* + * Prefer online nodes, but if reserved memory might be + * hot-added continue the search with reserved ranges. 
+ */ + if (nid != NUMA_NO_NODE) + return nid; + + return meminfo_to_nid(&numa_reserved_meminfo, start); +} +EXPORT_SYMBOL_GPL(phys_to_target_node); + +int memory_add_physaddr_to_nid(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + if (nid == NUMA_NO_NODE) + nid = numa_meminfo.blk[0].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); + +/* + * sort() comparator: order numa_memblk pointers by ascending ->start. + * The u64 start addresses are compared explicitly; returning their + * difference would truncate to int and could yield the wrong sign or 0. + */ +static int __init cmp_memblk(const void *a, const void *b) +{ + const struct numa_memblk *ma = *(const struct numa_memblk **)a; + const struct numa_memblk *mb = *(const struct numa_memblk **)b; + + return (ma->start > mb->start) - (ma->start < mb->start); +} + +static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata; + +/** + * numa_fill_memblks - Fill gaps in numa_meminfo memblks + * @start: address to begin fill + * @end: address to end fill + * + * Find and extend numa_meminfo memblks to cover the @start-@end + * physical address range, such that the first memblk includes + * @start, the last memblk includes @end, and any gaps in between + * are filled. + * + * RETURNS: + * 0 : Success + * NUMA_NO_MEMBLK : No memblk exists in @start-@end range + */ + +int __init numa_fill_memblks(u64 start, u64 end) +{ + struct numa_memblk **blk = &numa_memblk_list[0]; + struct numa_meminfo *mi = &numa_meminfo; + int count = 0; + u64 prev_end; + + /* + * Create a list of pointers to numa_meminfo memblks that + * overlap start, end. Exclude (start == bi->end) since + * end addresses in both a CFMWS range and a memblk range + * are exclusive. + * + * This list of pointers is used to make in-place changes + * that fill out the numa_meminfo memblks.
+ */ + for (int i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* + * @end is exclusive as well, so a memblk that merely + * begins at @end does not overlap and must be skipped. + */ + if (start < bi->end && end > bi->start) { + blk[count] = &mi->blk[i]; + count++; + } + } + if (!count) + return NUMA_NO_MEMBLK; + + /* Sort the list of pointers in memblk->start order */ + sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL); + + /* Make sure the first/last memblks include start/end */ + blk[0]->start = min(blk[0]->start, start); + blk[count - 1]->end = max(blk[count - 1]->end, end); + + /* + * Fill any gaps by tracking the previous memblks + * end address and backfilling to it if needed. + */ + prev_end = blk[0]->end; + for (int i = 1; i < count; i++) { + struct numa_memblk *curr = blk[i]; + + if (prev_end >= curr->start) { + if (prev_end < curr->end) + prev_end = curr->end; + } else { + curr->start = prev_end; + prev_end = curr->end; + } + } + return 0; +} + +#endif diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c new file mode 100644 index 0000000000..104544359d --- /dev/null +++ b/arch/x86/mm/numa_32.c @@ -0,0 +1,59 @@ +/* + * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation + * August 2002: added remote node KVA remap - Martin J. Bligh + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/memblock.h> +#include <linux/init.h> + +#include "numa_internal.h" + +extern unsigned long highend_pfn, highstart_pfn; + +void __init initmem_init(void) +{ + x86_numa_init(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", + max_low_pfn, highstart_pfn); + + printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", + (ulong) pfn_to_kaddr(highstart_pfn)); + + __vmalloc_start_set = true; + setup_bootmem_allocator(); +} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c new file mode 100644 index 0000000000..59d80160fa --- /dev/null +++ b/arch/x86/mm/numa_64.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic VM initialization for x86-64 NUMA setups. + * Copyright 2002,2003 Andi Kleen, SuSE Labs. 
+ */ +#include <linux/memblock.h> + +#include "numa_internal.h" + +void __init initmem_init(void) +{ + x86_numa_init(); +} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 0000000000..9a9305367f --- /dev/null +++ b/arch/x86/mm/numa_emulation.c @@ -0,0 +1,585 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NUMA emulation + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/topology.h> +#include <linux/memblock.h> +#include <asm/dma.h> + +#include "numa_internal.h" + +static int emu_nid_to_phys[MAX_NUMNODES]; +static char *emu_cmdline __initdata; + +int __init numa_emu_cmdline(char *str) +{ + emu_cmdline = str; + return 0; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + +static u64 __init mem_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = PFN_DOWN(end); + + if (start_pfn < end_pfn) + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); + return 0; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. 
+ */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start = pb->start; + eb->end = pb->start + size; + eb->nid = nid; + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = pb->nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask = numa_nodes_parsed; + u64 size; + int big; + int nid = 0; + int i, ret; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - mem_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. 
" + "NUMA emulation disabled.\n"); + return -1; + } + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (!nodes_empty(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; + + if (nid < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > limit) { + end = limit; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - mem_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. 
+ */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) +{ + unsigned long max_pfn = PHYS_PFN(max_addr); + unsigned long base_pfn = PHYS_PFN(base); + unsigned long hole_pfns = PHYS_PFN(hole); + + return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size, + int nr_nodes, struct numa_memblk *pblk, + int nid) +{ + nodemask_t physnode_mask = numa_nodes_parsed; + int i, ret, uniform = 0; + u64 min_size; + + if ((!size && !nr_nodes) || (nr_nodes && !pblk)) + return -1; + + /* + * In the 'uniform' case split the passed in physical node by + * nr_nodes, in the non-uniform case, ignore the passed in + * physical block and try to create nodes of at least size + * @size. + * + * In the uniform case, split the nodes strictly by physical + * capacity, i.e. ignore holes. In the non-uniform case account + * for holes and treat @size as a minimum floor. + */ + if (!nr_nodes) + nr_nodes = MAX_NUMNODES; + else { + nodes_clear(physnode_mask); + node_set(pblk->nid, physnode_mask); + uniform = 1; + } + + if (uniform) { + min_size = uniform_size(max_addr, addr, 0, nr_nodes); + size = min_size; + } else { + /* + * The limit on emulated nodes is MAX_NUMNODES, so the + * size per node is increased accordingly if the + * requested size is too small. This creates a uniform + * distribution of node sizes across the entire machine + * (but not necessarily over physical nodes). 
+ */ + min_size = uniform_size(max_addr, addr, + mem_hole_size(addr, max_addr), nr_nodes); + } + min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (!nodes_empty(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + if (uniform) + end = start + size; + else + end = find_end_of_node(start, limit, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. 
+ */ + if ((limit - end - mem_hole_size(end, limit) < size) + && !uniform) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return nid; +} + +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, + 0, NULL, 0); +} + +static int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. 
+ */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr = PFN_PHYS(max_pfn); + u8 *phys_dist = NULL; + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int max_emu_nid, dfl_phys_nid; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi = *numa_meminfo; + + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'U')) { + nodemask_t physnode_mask = numa_nodes_parsed; + unsigned long n; + int nid = 0; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = -1; + for_each_node_mask(i, physnode_mask) { + /* + * The reason we pass in blk[0] is due to + * numa_remove_memblk_from() called by + * emu_setup_memblk() will delete entry 0 + * and then move everything else up in the pi.blk + * array. Therefore we should always be looking + * at blk[0]. 
+ */ + ret = split_nodes_size_interleave_uniform(&ei, &pi, + pi.blk[0].start, pi.blk[0].end, 0, + n, &pi.blk[0], nid); + if (ret < 0) + break; + if (ret < n) { + pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", + __func__, i, ret, n); + ret = -1; + break; + } + nid = ret; + } + } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size = memparse(emu_cmdline, &emu_cmdline); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + if (*emu_cmdline == ':') + emu_cmdline++; + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + goto no_emu; + } + + /* copy the physical distance table */ + if (numa_dist_cnt) { + u64 phys; + + phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); + if (!phys) { + pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + goto no_emu; + } + phys_dist = __va(phys); + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); + } + + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. + */ + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); + + /* commit */ + *numa_meminfo = ei; + + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + + /* + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. 
+ */ + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; + } + + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = dfl_phys_nid; + + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < max_emu_nid + 1; i++) { + for (j = 0; j < max_emu_nid + 1; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (get_option(&emu_cmdline, &dist) == 2) + ; + else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + + /* free the copied physical distance table */ + memblock_free(phys_dist, phys_size); + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void numa_add_cpu(int cpu) +{ + int physnid, nid; + + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + physnid = emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. 
+ */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void numa_remove_cpu(int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void numa_set_cpumask(int cpu, bool enable) +{ + int nid, physnid; + + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(nid) { + if (emu_nid_to_phys[nid] != physnid) + continue; + + debug_cpumask_set_cpu(cpu, nid, enable); + } +} + +void numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, false); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 0000000000..86860f2796 --- /dev/null +++ b/arch/x86/mm/numa_internal.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_MM_NUMA_INTERNAL_H +#define __X86_MM_NUMA_INTERNAL_H + +#include <linux/types.h> +#include <asm/numa.h> + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void __init numa_reset_distance(void); + +void __init x86_numa_init(void); + +#ifdef CONFIG_NUMA_EMU +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +#endif + +#endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pat/Makefile b/arch/x86/mm/pat/Makefile new file mode 100644 index 0000000000..ea464c9951 --- /dev/null +++ b/arch/x86/mm/pat/Makefile @@ -0,0 +1,5 @@ 
+# SPDX-License-Identifier: GPL-2.0 + +obj-y := set_memory.o memtype.o + +obj-$(CONFIG_X86_PAT) += memtype_interval.o diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c new file mode 100644 index 0000000000..3d2f7f0a6e --- /dev/null +++ b/arch/x86/mm/pat/cpa-test.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * self test for change_page_attr. + * + * Clears the a test pte bit on random pages in the direct mapping, + * then reverts and compares page tables forwards and afterwards. + */ +#include <linux/memblock.h> +#include <linux/kthread.h> +#include <linux/random.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include <asm/cacheflush.h> +#include <asm/kdebug.h> + +/* + * Only print the results of the first pass: + */ +static __read_mostly int print = 1; + +enum { + NTEST = 3 * 100, + NPAGES = 100, +#ifdef CONFIG_X86_64 + LPS = (1 << PMD_SHIFT), +#elif defined(CONFIG_X86_PAE) + LPS = (1 << PMD_SHIFT), +#else + LPS = (1 << 22), +#endif + GPS = (1<<30) +}; + +#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST) + +static int pte_testbit(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFTW1; +} + +struct split_state { + long lpg, gpg, spg, exec; + long min_exec, max_exec; +}; + +static int print_split(struct split_state *s) +{ + long i, expected, missed = 0; + int err = 0; + + s->lpg = s->gpg = s->spg = s->exec = 0; + s->min_exec = ~0UL; + s->max_exec = 0; + for (i = 0; i < max_pfn_mapped; ) { + unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT); + unsigned int level; + pte_t *pte; + + pte = lookup_address(addr, &level); + if (!pte) { + missed++; + i++; + continue; + } + + if (level == PG_LEVEL_1G && sizeof(long) == 8) { + s->gpg++; + i += GPS/PAGE_SIZE; + } else if (level == PG_LEVEL_2M) { + if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) { + printk(KERN_ERR + "%lx level %d but not PSE %Lx\n", + addr, level, (u64)pte_val(*pte)); + err = 1; + } + 
s->lpg++; + i += LPS/PAGE_SIZE; + } else { + s->spg++; + i++; + } + if (!(pte_val(*pte) & _PAGE_NX)) { + s->exec++; + if (addr < s->min_exec) + s->min_exec = addr; + if (addr > s->max_exec) + s->max_exec = addr; + } + } + if (print) { + printk(KERN_INFO + " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, + s->max_exec, missed); + } + + expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; + if (expected != i) { + printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n", + max_pfn_mapped, expected); + return 1; + } + return err; +} + +static unsigned long addr[NTEST]; +static unsigned int len[NTEST]; + +static struct page *pages[NPAGES]; +static unsigned long addrs[NPAGES]; + +/* Change the global bit on random pages in the direct mapping */ +static int pageattr_test(void) +{ + struct split_state sa, sb, sc; + unsigned long *bm; + pte_t *pte, pte0; + int failed = 0; + unsigned int level; + int i, k; + int err; + + if (print) + printk(KERN_INFO "CPA self-test:\n"); + + bm = vzalloc((max_pfn_mapped + 7) / 8); + if (!bm) { + printk(KERN_ERR "CPA Cannot vmalloc bitmap\n"); + return -ENOMEM; + } + + failed += print_split(&sa); + + for (i = 0; i < NTEST; i++) { + unsigned long pfn = get_random_u32_below(max_pfn_mapped); + + addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); + len[i] = get_random_u32_below(NPAGES); + len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); + + if (len[i] == 0) + len[i] = 1; + + pte = NULL; + pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */ + + for (k = 0; k < len[i]; k++) { + pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 || + !(pte_val(*pte) & _PAGE_PRESENT)) { + addr[i] = 0; + break; + } + if (k == 0) { + pte0 = *pte; + } else { + if (pgprot_val(pte_pgprot(*pte)) != + pgprot_val(pte_pgprot(pte0))) { + len[i] = k; + break; + } + } + if (test_bit(pfn + k, bm)) { + len[i] = k; + break; 
+ } + __set_bit(pfn + k, bm); + addrs[k] = addr[i] + k*PAGE_SIZE; + pages[k] = pfn_to_page(pfn + k); + } + if (!addr[i] || !pte || !k) { + addr[i] = 0; + continue; + } + + switch (i % 3) { + case 0: + err = change_page_attr_set(&addr[i], len[i], PAGE_CPA_TEST, 0); + break; + + case 1: + err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + break; + + case 2: + err = cpa_set_pages_array(pages, len[i], PAGE_CPA_TEST); + break; + } + + + if (err < 0) { + printk(KERN_ERR "CPA %d failed %d\n", i, err); + failed++; + } + + pte = lookup_address(addr[i], &level); + if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], + pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + if (level != PG_LEVEL_4K) { + printk(KERN_ERR "CPA %lx: unexpected level %d\n", + addr[i], level); + failed++; + } + + } + vfree(bm); + + failed += print_split(&sb); + + for (i = 0; i < NTEST; i++) { + if (!addr[i]) + continue; + pte = lookup_address(addr[i], &level); + if (!pte) { + printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]); + failed++; + continue; + } + err = change_page_attr_clear(&addr[i], len[i], PAGE_CPA_TEST, 0); + if (err < 0) { + printk(KERN_ERR "CPA reverting failed: %d\n", err); + failed++; + } + pte = lookup_address(addr[i], &level); + if (!pte || pte_testbit(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", + addr[i], pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + + } + + failed += print_split(&sc); + + if (failed) { + WARN(1, KERN_ERR "NOT PASSED. 
Please report.\n"); + return -EINVAL; + } else { + if (print) + printk(KERN_INFO "ok.\n"); + } + + return 0; +} + +static int do_pageattr_test(void *__unused) +{ + while (!kthread_should_stop()) { + schedule_timeout_interruptible(HZ*30); + if (pageattr_test() < 0) + break; + if (print) + print--; + } + return 0; +} + +static int start_pageattr_test(void) +{ + struct task_struct *p; + + p = kthread_create(do_pageattr_test, NULL, "pageattr-test"); + if (!IS_ERR(p)) + wake_up_process(p); + else + WARN_ON(1); + + return 0; +} +device_initcall(start_pageattr_test); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c new file mode 100644 index 0000000000..de10800cd4 --- /dev/null +++ b/arch/x86/mm/pat/memtype.c @@ -0,0 +1,1194 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Page Attribute Table (PAT) support: handle memory caching attributes in page tables. + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. + * + * Basic principles: + * + * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and + * the kernel to set one of a handful of 'caching type' attributes for physical + * memory ranges: uncached, write-combining, write-through, write-protected, + * and the most commonly used and default attribute: write-back caching. + * + * PAT support supercedes and augments MTRR support in a compatible fashion: MTRR is + * a hardware interface to enumerate a limited number of physical memory ranges + * and set their caching attributes explicitly, programmed into the CPU via MSRs. + * Even modern CPUs have MTRRs enabled - but these are typically not touched + * by the kernel or by user-space (such as the X server), we rely on PAT for any + * additional cache attribute logic. 
+ * + * PAT doesn't work via explicit memory ranges, but uses page table entries to add + * cache attribute information to the mapped memory range: there's 3 bits used, + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the + * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT). + * + * ( There's a metric ton of finer details, such as compatibility with CPU quirks + * that only support 4 types of PAT entries, and interaction with MTRRs, see + * below for details. ) + */ + +#include <linux/seq_file.h> +#include <linux/memblock.h> +#include <linux/debugfs.h> +#include <linux/ioport.h> +#include <linux/kernel.h> +#include <linux/pfn_t.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/rbtree.h> + +#include <asm/cacheflush.h> +#include <asm/cacheinfo.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/x86_init.h> +#include <asm/fcntl.h> +#include <asm/e820/api.h> +#include <asm/mtrr.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/memtype.h> +#include <asm/io.h> + +#include "memtype.h" +#include "../mm_internal.h" + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static u64 __ro_after_init pat_msr_val; + +/* + * PAT support is enabled by default, but can be disabled for + * various user-requested or hardware-forced reasons: + */ +static void __init pat_disable(const char *msg_reason) +{ + if (pat_disabled) + return; + + pat_disabled = true; + pr_info("x86/PAT: %s\n", msg_reason); + + memory_caching_control &= ~CACHE_PAT; +} + +static int __init nopat(char *str) +{ + pat_disable("PAT support disabled via boot option."); + return 0; +} +early_param("nopat", nopat); + +bool pat_enabled(void) +{ + return !pat_disabled; +} +EXPORT_SYMBOL_GPL(pat_enabled); + +int pat_debug_enable; + +static int __init pat_debug_setup(char *str) +{ + pat_debug_enable = 1; + return 1; +} 
+__setup("debugpat", pat_debug_setup); + +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags arch_1 and uncached together to keep track of + * memory type of pages that have backing page struct. + * + * X86 PAT supports 4 different memory types: + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_WT + * + * _PAGE_CACHE_MODE_WB is the default type. + */ + +#define _PGMT_WB 0 +#define _PGMT_WC (1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_uncached) +#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_CLEAR_MASK (~_PGMT_MASK) + +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + unsigned long pg_flags = pg->flags & _PGMT_MASK; + + if (pg_flags == _PGMT_WB) + return _PAGE_CACHE_MODE_WB; + else if (pg_flags == _PGMT_WC) + return _PAGE_CACHE_MODE_WC; + else if (pg_flags == _PGMT_UC_MINUS) + return _PAGE_CACHE_MODE_UC_MINUS; + else + return _PAGE_CACHE_MODE_WT; +} + +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ + unsigned long memtype_flags; + unsigned long old_flags; + unsigned long new_flags; + + switch (memtype) { + case _PAGE_CACHE_MODE_WC: + memtype_flags = _PGMT_WC; + break; + case _PAGE_CACHE_MODE_UC_MINUS: + memtype_flags = _PGMT_UC_MINUS; + break; + case _PAGE_CACHE_MODE_WT: + memtype_flags = _PGMT_WT; + break; + case _PAGE_CACHE_MODE_WB: + default: + memtype_flags = _PGMT_WB; + break; + } + + old_flags = READ_ONCE(pg->flags); + do { + new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; + } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags)); +} +#else +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + return -1; +} +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ +} +#endif + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through 
*/ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ +}; + +#define CM(c) (_PAGE_CACHE_MODE_ ## c) + +static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val, + char *msg) +{ + enum page_cache_mode cache; + char *cache_mode; + + switch (pat_val) { + case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; + case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; + case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; + case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; + case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; + case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; + default: cache = CM(WB); cache_mode = "WB "; break; + } + + memcpy(msg, cache_mode, 4); + + return cache; +} + +#undef CM + +/* + * Update the cache mode to pgprot translation tables according to PAT + * configuration. + * Using lower indices is preferred, so we start with highest index. + */ +static void __init init_cache_modes(u64 pat) +{ + enum page_cache_mode cache; + char pat_msg[33]; + int i; + + pat_msg[32] = 0; + for (i = 7; i >= 0; i--) { + cache = pat_get_cache_mode((pat >> (i * 8)) & 7, + pat_msg + 4 * i); + update_cache_mode_entry(i, cache); + } + pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); +} + +void pat_cpu_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_PAT)) { + /* + * If this happens we are on a secondary CPU, but switched to + * PAT on the boot CPU. We have no way to undo PAT. + */ + panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); + } + + wrmsrl(MSR_IA32_CR_PAT, pat_msr_val); +} + +/** + * pat_bp_init - Initialize the PAT MSR value and PAT table + * + * This function initializes PAT MSR value and PAT table with an OS-defined + * value to enable additional cache attributes, WC, WT and WP. + * + * This function prepares the calls of pat_cpu_init() via cache_cpu_init() + * on all CPUs. 
+ */ +void __init pat_bp_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; +#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \ + (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \ + ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \ + ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \ + ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56)) + + + if (!IS_ENABLED(CONFIG_X86_PAT)) + pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n"); + + if (!cpu_feature_enabled(X86_FEATURE_PAT)) + pat_disable("PAT not supported by the CPU."); + else + rdmsrl(MSR_IA32_CR_PAT, pat_msr_val); + + if (!pat_msr_val) { + pat_disable("PAT support disabled by the firmware."); + + /* + * No PAT. Emulate the PAT table that corresponds to the two + * cache bits, PWT (Write Through) and PCD (Cache Disable). + * This setup is also the same as the BIOS default setup. + * + * PTE encoding: + * + * PCD + * |PWT PAT + * || slot + * 00 0 WB : _PAGE_CACHE_MODE_WB + * 01 1 WT : _PAGE_CACHE_MODE_WT + * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 11 3 UC : _PAGE_CACHE_MODE_UC + * + * NOTE: When WC or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC); + } + + /* + * Xen PV doesn't allow to set PAT MSR, but all cache modes are + * supported. + * When running as TDX guest setting the PAT MSR won't work either + * due to the requirement to set CR0.CD when doing so. Rely on + * firmware to have set the PAT MSR correctly. + */ + if (pat_disabled || + cpu_feature_enabled(X86_FEATURE_XENPV) || + cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + init_cache_modes(pat_msr_val); + return; + } + + if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + /* + * PAT support with the lower four entries. 
Intel Pentium 2, + * 3, M, and 4 are affected by PAT errata, which makes the + * upper four entries unusable. To be on the safe side, we don't + * use those. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * PAT bit unused + * + * NOTE: When WT or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC); + } else { + /* + * Full PAT support. We put WT in slot 7 to improve + * robustness in the presence of errata that might cause + * the high PAT bit to be ignored. This way, a buggy slot 7 + * access will hit slot 3, and slot 3 is UC, so at worst + * we lose performance without causing a correctness issue. + * Pentium 4 erratum N46 is an example for such an erratum, + * although we try not to use PAT at all on affected CPUs. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * 100 4 WB : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP + * 110 6 UC-: Reserved + * 111 7 WT : _PAGE_CACHE_MODE_WT + * + * The reserved slots are unused, but mapped to their + * corresponding types in the presence of PAT errata. + */ + pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT); + } + + memory_caching_control |= CACHE_PAT; + + init_cache_modes(pat_msr_val); +#undef PAT +} + +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ + +/* + * Does intersection of PAT memory type and MTRR memory type and returns + * the resulting memory type as PAT understands it. 
+ * (Type in pat and mtrr will not have same value) + * The intersection is based on "Effective Memory Type" tables in IA-32 + * SDM vol 3a + */ +static unsigned long pat_x_mtrr_type(u64 start, u64 end, + enum page_cache_mode req_type) +{ + /* + * Look for MTRR hint to get the effective type in case where PAT + * request is for WB. + */ + if (req_type == _PAGE_CACHE_MODE_WB) { + u8 mtrr_type, uniform; + + mtrr_type = mtrr_type_lookup(start, end, &uniform); + if (mtrr_type != MTRR_TYPE_WRBACK) + return _PAGE_CACHE_MODE_UC_MINUS; + + return _PAGE_CACHE_MODE_WB; + } + + return req_type; +} + +struct pagerange_state { + unsigned long cur_pfn; + int ram; + int not_ram; +}; + +static int +pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) +{ + struct pagerange_state *state = arg; + + state->not_ram |= initial_pfn > state->cur_pfn; + state->ram |= total_nr_pages > 0; + state->cur_pfn = initial_pfn + total_nr_pages; + + return state->ram && state->not_ram; +} + +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) +{ + int ret = 0; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + struct pagerange_state state = {start_pfn, 0, 0}; + + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) + start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; + + if (start_pfn < end_pfn) { + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &state, pagerange_is_ram_callback); + } + + return (ret > 0) ? -1 : (state.ram ? 1 : 0); +} + +/* + * For RAM pages, we use page flags to mark the pages with appropriate type. 
+ * The page flags are limited to four types, WB (default), WC, WT and UC-. + * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting + * a new memory type is only allowed for a page mapped with the default WB + * type. + * + * Here we do two passes: + * - Find the memtype of all the pages in the range, look for any conflicts. + * - In case of no conflicts, set the new memtype for pages in the range. + */ +static int reserve_ram_pages_type(u64 start, u64 end, + enum page_cache_mode req_type, + enum page_cache_mode *new_type) +{ + struct page *page; + u64 pfn; + + if (req_type == _PAGE_CACHE_MODE_WP) { + if (new_type) + *new_type = _PAGE_CACHE_MODE_UC_MINUS; + return -EINVAL; + } + + if (req_type == _PAGE_CACHE_MODE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_MODE_UC_MINUS; + } + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + enum page_cache_mode type; + + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != _PAGE_CACHE_MODE_WB) { + pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", + start, end - 1, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } + } + + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, req_type); + } + return 0; +} + +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + u64 pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, _PAGE_CACHE_MODE_WB); + } + return 0; +} + +static u64 sanitize_phys(u64 address) +{ + /* + * When changing the memtype for pages containing poison allow + * for a "decoy" virtual address (bit 63 clear) passed to + * set_memory_X(). __pa() on a "decoy" address results in a + * physical address with bit 63 set. 
+ * + * Decoy addresses are not present for 32-bit builds, see + * set_mce_nospec(). + */ + if (IS_ENABLED(CONFIG_X86_64)) + return address & __PHYSICAL_MASK; + return address; +} + +/* + * req_type typically has one of the: + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_UC + * - _PAGE_CACHE_MODE_WT + * + * If new_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If new_type is non-NULL, function will return + * available type in new_type in case of no error. In case of any error + * it will return a negative return value. + */ +int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type, + enum page_cache_mode *new_type) +{ + struct memtype *entry_new; + enum page_cache_mode actual_type; + int is_range_ram; + int err = 0; + + start = sanitize_phys(start); + + /* + * The end address passed into this function is exclusive, but + * sanitize_phys() expects an inclusive address. + */ + end = sanitize_phys(end - 1) + 1; + if (start >= end) { + WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, + start, end - 1, cattr_name(req_type)); + return -EINVAL; + } + + if (!pat_enabled()) { + /* This is identical to page table setting without PAT */ + if (new_type) + *new_type = req_type; + return 0; + } + + /* Low ISA region is always mapped WB in page table. No need to track */ + if (x86_platform.is_untracked_pat_range(start, end)) { + if (new_type) + *new_type = _PAGE_CACHE_MODE_WB; + return 0; + } + + /* + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. 
+ */ + actual_type = pat_x_mtrr_type(start, end, req_type); + + if (new_type) + *new_type = actual_type; + + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + err = reserve_ram_pages_type(start, end, req_type, new_type); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; + } + + entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL); + if (!entry_new) + return -ENOMEM; + + entry_new->start = start; + entry_new->end = end; + entry_new->type = actual_type; + + spin_lock(&memtype_lock); + + err = memtype_check_insert(entry_new, new_type); + if (err) { + pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(entry_new->type), cattr_name(req_type)); + kfree(entry_new); + spin_unlock(&memtype_lock); + + return err; + } + + spin_unlock(&memtype_lock); + + dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(entry_new->type), cattr_name(req_type), + new_type ? cattr_name(*new_type) : "-"); + + return err; +} + +int memtype_free(u64 start, u64 end) +{ + int is_range_ram; + struct memtype *entry_old; + + if (!pat_enabled()) + return 0; + + start = sanitize_phys(start); + end = sanitize_phys(end); + + /* Low ISA region is always mapped WB. 
No need to track */ + if (x86_platform.is_untracked_pat_range(start, end)) + return 0; + + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + if (is_range_ram < 0) + return -EINVAL; + + spin_lock(&memtype_lock); + entry_old = memtype_erase(start, end); + spin_unlock(&memtype_lock); + + if (IS_ERR(entry_old)) { + pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); + return -EINVAL; + } + + kfree(entry_old); + + dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1); + + return 0; +} + + +/** + * lookup_memtype - Looks up the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS + * or _PAGE_CACHE_MODE_WT. + */ +static enum page_cache_mode lookup_memtype(u64 paddr) +{ + enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB; + struct memtype *entry; + + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + + page = pfn_to_page(paddr >> PAGE_SHIFT); + return get_page_memtype(page); + } + + spin_lock(&memtype_lock); + + entry = memtype_lookup(paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_MODE_UC_MINUS; + + spin_unlock(&memtype_lock); + + return rettype; +} + +/** + * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type + * of @pfn cannot be overridden by UC MTRR memory type. + * + * Only to be called when PAT is enabled. + * + * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC. + * Returns false in other cases. 
+ */ +bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) +{ + enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn)); + + return cm == _PAGE_CACHE_MODE_UC || + cm == _PAGE_CACHE_MODE_UC_MINUS || + cm == _PAGE_CACHE_MODE_WC; +} +EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); + +/** + * memtype_reserve_io - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int memtype_reserve_io(resource_size_t start, resource_size_t end, + enum page_cache_mode *type) +{ + resource_size_t size = end - start; + enum page_cache_mode req_type = *type; + enum page_cache_mode new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); + + ret = memtype_reserve(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(start, size, req_type, new_type)) + goto out_free; + + if (memtype_kernel_map_sync(start, size, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + memtype_free(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * memtype_free_io - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void memtype_free_io(resource_size_t start, resource_size_t end) +{ + memtype_free(start, end); +} + +#ifdef CONFIG_X86_PAT +int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) +{ + enum page_cache_mode type = _PAGE_CACHE_MODE_WC; + + return memtype_reserve_io(start, start + size, &type); +} +EXPORT_SYMBOL(arch_io_reserve_memtype_wc); + +void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) +{ + memtype_free_io(start, start + size); +} 
+EXPORT_SYMBOL(arch_io_free_memtype_wc); +#endif + +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) + vma_prot = pgprot_decrypted(vma_prot); + + return vma_prot; +} + +#ifdef CONFIG_STRICT_DEVMEM +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + return 1; +} +#else +/* This check is needed to avoid cache aliasing when PAT is enabled */ +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + u64 from = ((u64)pfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + if (!pat_enabled()) + return 1; + + while (cursor < to) { + if (!devmem_is_allowed(pfn)) + return 0; + cursor += PAGE_SIZE; + pfn++; + } + return 1; +} +#endif /* CONFIG_STRICT_DEVMEM */ + +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; + + if (!range_is_allowed(pfn, size)) + return 0; + + if (file->f_flags & O_DSYNC) + pcm = _PAGE_CACHE_MODE_UC_MINUS; + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + cachemode2protval(pcm)); + return 1; +} + +/* + * Change the memory type for the physical address range in kernel identity + * mapping space if that range is a part of identity map. + */ +int memtype_kernel_map_sync(u64 base, unsigned long size, + enum page_cache_mode pcm) +{ + unsigned long id_sz; + + if (base > __pa(high_memory-1)) + return 0; + + /* + * Some areas in the middle of the kernel identity range + * are not mapped, for example the PCI space. + */ + if (!page_is_ram(base >> PAGE_SHIFT)) + return 0; + + id_sz = (__pa(high_memory-1) <= base + size) ? 
+ __pa(high_memory) - base : size; + + if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { + pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, + cattr_name(pcm), + base, (unsigned long long)(base + size-1)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful memtype_reserve, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, + int strict_prot) +{ + int is_ram = 0; + int ret; + enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot); + enum page_cache_mode pcm = want_pcm; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + + /* + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. 
+ */ + if (is_ram) { + if (!pat_enabled()) + return 0; + + pcm = lookup_memtype(paddr); + if (want_pcm != pcm) { + pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_pcm), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(pcm)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); + } + return 0; + } + + ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm); + if (ret) + return ret; + + if (pcm != want_pcm) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { + memtype_free(paddr, paddr + size); + pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_pcm), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(pcm)); + return -EINVAL; + } + /* + * We allow returning different type than the one requested in + * non strict case. + */ + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); + } + + if (memtype_kernel_map_sync(paddr, size, pcm) < 0) { + memtype_free(paddr, paddr + size); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + memtype_free(paddr, paddr + size); +} + +/* + * track_pfn_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. 
+ */ +int track_pfn_copy(struct vm_area_struct *vma) +{ + resource_size_t paddr; + unsigned long prot; + unsigned long vma_size = vma->vm_end - vma->vm_start; + pgprot_t pgprot; + + if (vma->vm_flags & VM_PAT) { + /* + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. + */ + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + pgprot = __pgprot(prot); + return reserve_pfn_range(paddr, vma_size, &pgprot, 1); + } + + return 0; +} + +/* + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. + */ +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long addr, unsigned long size) +{ + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; + enum page_cache_mode pcm; + + /* reserve the whole chunk starting from paddr */ + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (ret == 0 && vma) + vm_flags_set(vma, VM_PAT); + return ret; + } + + if (!pat_enabled()) + return 0; + + /* + * For anything smaller than the vma size we set prot based on the + * lookup. 
+ */ + pcm = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (pcm != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); + + return 0; +} + +void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) +{ + enum page_cache_mode pcm; + + if (!pat_enabled()) + return; + + /* Set prot based on lookup */ + pcm = lookup_memtype(pfn_t_to_phys(pfn)); + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); +} + +/* + * untrack_pfn is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case pfn, size are zero). + */ +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size, bool mm_wr_locked) +{ + resource_size_t paddr; + unsigned long prot; + + if (vma && !(vma->vm_flags & VM_PAT)) + return; + + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; + } + free_pfn_range(paddr, size); + if (vma) { + if (mm_wr_locked) + vm_flags_clear(vma, VM_PAT); + else + __vm_flags_mod(vma, 0, VM_PAT); + } +} + +/* + * untrack_pfn_clear is called if the following situation fits: + * + * 1) while mremapping a pfnmap for a new region, with the old vma after + * its pfnmap page table has been removed. The new vma has a new pfnmap + * to the same pfn & cache type with VM_PAT set. + * 2) while duplicating vm area, the new vma fails to copy the pgtable from + * old vma. 
+ */ +void untrack_pfn_clear(struct vm_area_struct *vma) +{ + vm_flags_clear(vma, VM_PAT); +} + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) + +/* + * We are allocating a temporary printout-entry to be passed + * between seq_start()/next() and seq_show(): + */ +static struct memtype *memtype_get_idx(loff_t pos) +{ + struct memtype *entry_print; + int ret; + + entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL); + if (!entry_print) + return NULL; + + spin_lock(&memtype_lock); + ret = memtype_copy_nth_element(entry_print, pos); + spin_unlock(&memtype_lock); + + /* Free it on error: */ + if (ret) { + kfree(entry_print); + return NULL; + } + + return entry_print; +} + +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos == 0) { + ++*pos; + seq_puts(seq, "PAT memtype list:\n"); + } + + return memtype_get_idx(*pos); +} + +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + kfree(v); + ++*pos; + return memtype_get_idx(*pos); +} + +static void memtype_seq_stop(struct seq_file *seq, void *v) +{ + kfree(v); +} + +static int memtype_seq_show(struct seq_file *seq, void *v) +{ + struct memtype *entry_print = (struct memtype *)v; + + seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n", + entry_print->start, + entry_print->end, + cattr_name(entry_print->type)); + + return 0; +} + +static const struct seq_operations memtype_seq_ops = { + .start = memtype_seq_start, + .next = memtype_seq_next, + .stop = memtype_seq_stop, + .show = memtype_seq_show, +}; + +static int memtype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &memtype_seq_ops); 
+} + +static const struct file_operations memtype_fops = { + .open = memtype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init pat_memtype_list_init(void) +{ + if (pat_enabled()) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } + return 0; +} +late_initcall(pat_memtype_list_init); + +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ diff --git a/arch/x86/mm/pat/memtype.h b/arch/x86/mm/pat/memtype.h new file mode 100644 index 0000000000..cacecdbceb --- /dev/null +++ b/arch/x86/mm/pat/memtype.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MEMTYPE_H_ +#define __MEMTYPE_H_ + +extern int pat_debug_enable; + +#define dprintk(fmt, arg...) \ + do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0) + +struct memtype { + u64 start; + u64 end; + u64 subtree_max_end; + enum page_cache_mode type; + struct rb_node rb; +}; + +static inline char *cattr_name(enum page_cache_mode pcm) +{ + switch (pcm) { + case _PAGE_CACHE_MODE_UC: return "uncached"; + case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_MODE_WB: return "write-back"; + case _PAGE_CACHE_MODE_WC: return "write-combining"; + case _PAGE_CACHE_MODE_WT: return "write-through"; + case _PAGE_CACHE_MODE_WP: return "write-protected"; + default: return "broken"; + } +} + +#ifdef CONFIG_X86_PAT +extern int memtype_check_insert(struct memtype *entry_new, + enum page_cache_mode *new_type); +extern struct memtype *memtype_erase(u64 start, u64 end); +extern struct memtype *memtype_lookup(u64 addr); +extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos); +#else +static inline int memtype_check_insert(struct memtype *entry_new, + enum page_cache_mode *new_type) +{ return 0; } +static inline struct memtype *memtype_erase(u64 start, u64 end) +{ return NULL; } +static inline struct memtype *memtype_lookup(u64 addr) +{ return NULL; } +static inline int 
memtype_copy_nth_element(struct memtype *out, loff_t pos) +{ return 0; } +#endif + +#endif /* __MEMTYPE_H_ */ diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c new file mode 100644 index 0000000000..645613d599 --- /dev/null +++ b/arch/x86/mm/pat/memtype_interval.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Interval tree used to store the PAT memory type reservations. + */ + +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/interval_tree_generic.h> +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/pgtable.h> + +#include <asm/memtype.h> + +#include "memtype.h" + +/* + * The memtype tree keeps track of memory type for specific + * physical memory areas. Without proper tracking, conflicting memory + * types in different mappings can cause CPU cache corruption. + * + * The tree is an interval tree (augmented rbtree) which tree is ordered + * by the starting address. The tree can contain multiple entries for + * different regions which overlap. All the aliases have the same + * cache attributes of course, as enforced by the PAT logic. + * + * memtype_lock protects the rbtree. 
+ */ + +static inline u64 interval_start(struct memtype *entry) +{ + return entry->start; +} + +static inline u64 interval_end(struct memtype *entry) +{ + return entry->end - 1; +} + +INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end, + interval_start, interval_end, + static, interval) + +static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED; + +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_match(u64 start, u64 end, int match_type) +{ + struct memtype *entry_match; + + entry_match = interval_iter_first(&memtype_rbroot, start, end-1); + + while (entry_match != NULL && entry_match->start < end) { + if ((match_type == MEMTYPE_EXACT_MATCH) && + (entry_match->start == start) && (entry_match->end == end)) + return entry_match; + + if ((match_type == MEMTYPE_END_MATCH) && + (entry_match->start < start) && (entry_match->end == end)) + return entry_match; + + entry_match = interval_iter_next(entry_match, start, end-1); + } + + return NULL; /* Returns NULL if there is no match */ +} + +static int memtype_check_conflict(u64 start, u64 end, + enum page_cache_mode reqtype, + enum page_cache_mode *newtype) +{ + struct memtype *entry_match; + enum page_cache_mode found_type = reqtype; + + entry_match = interval_iter_first(&memtype_rbroot, start, end-1); + if (entry_match == NULL) + goto success; + + if (entry_match->type != found_type && newtype == NULL) + goto failure; + + dprintk("Overlap at 0x%Lx-0x%Lx\n", entry_match->start, entry_match->end); + found_type = entry_match->type; + + entry_match = interval_iter_next(entry_match, start, end-1); + while (entry_match) { + if (entry_match->type != found_type) + goto failure; + + entry_match = interval_iter_next(entry_match, start, end-1); + } +success: + if (newtype) + *newtype = found_type; + + return 0; + +failure: + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), 
cattr_name(entry_match->type)); + + return -EBUSY; +} + +int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_type) +{ + int err = 0; + + err = memtype_check_conflict(entry_new->start, entry_new->end, entry_new->type, ret_type); + if (err) + return err; + + if (ret_type) + entry_new->type = *ret_type; + + interval_insert(entry_new, &memtype_rbroot); + return 0; +} + +struct memtype *memtype_erase(u64 start, u64 end) +{ + struct memtype *entry_old; + + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH); + if (!entry_old) { + entry_old = memtype_match(start, end, MEMTYPE_END_MATCH); + if (!entry_old) + return ERR_PTR(-EINVAL); + } + + if (entry_old->start == start) { + /* munmap: erase this node */ + interval_remove(entry_old, &memtype_rbroot); + } else { + /* mremap: update the end value of this node */ + interval_remove(entry_old, &memtype_rbroot); + entry_old->end = start; + interval_insert(entry_old, &memtype_rbroot); + + return NULL; + } + + return entry_old; +} + +struct memtype *memtype_lookup(u64 addr) +{ + return interval_iter_first(&memtype_rbroot, addr, addr + PAGE_SIZE-1); +} + +/* + * Debugging helper, copy the Nth entry of the tree into a + * a copy for printout. 
This allows us to print out the tree + * via debugfs, without holding the memtype_lock too long: + */ +#ifdef CONFIG_DEBUG_FS +int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos) +{ + struct memtype *entry_match; + int i = 1; + + entry_match = interval_iter_first(&memtype_rbroot, 0, ULONG_MAX); + + while (entry_match && pos != i) { + entry_match = interval_iter_next(entry_match, 0, ULONG_MAX); + i++; + } + + if (entry_match) { /* pos == i */ + *entry_out = *entry_match; + return 0; + } else { + return 1; + } +} +#endif diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c new file mode 100644 index 0000000000..bda9f12983 --- /dev/null +++ b/arch/x86/mm/pat/set_memory.c @@ -0,0 +1,2477 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ +#include <linux/highmem.h> +#include <linux/memblock.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/debugfs.h> +#include <linux/pfn.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/pci.h> +#include <linux/vmalloc.h> +#include <linux/libnvdimm.h> +#include <linux/vmstat.h> +#include <linux/kernel.h> +#include <linux/cc_platform.h> +#include <linux/set_memory.h> +#include <linux/memregion.h> + +#include <asm/e820/api.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <linux/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/proto.h> +#include <asm/memtype.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> + +#include "../mm_internal.h" + +/* + * The current flushing context - we pass it instead of 5 arguments: + */ +struct cpa_data { + unsigned long *vaddr; + pgd_t *pgd; + pgprot_t mask_set; + pgprot_t mask_clr; + unsigned long numpages; + unsigned long curpage; + unsigned long pfn; + unsigned int flags; 
+ unsigned int force_split : 1, + force_static_prot : 1, + force_flush_all : 1; + struct page **pages; +}; + +enum cpa_warn { + CPA_CONFLICT, + CPA_PROTECT, + CPA_DETECT, +}; + +static const int cpa_warn_level = CPA_PROTECT; + +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock. So that we don't allow any other cpu, with stale large tlb + * entries change the page attribute in parallel to some other cpu + * splitting a large page entry along with changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 +#define CPA_PAGES_ARRAY 4 +#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ + +static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) +{ + return __pgprot(cachemode2protval(pcm)); +} + +#ifdef CONFIG_PROC_FS +static unsigned long direct_pages_count[PG_LEVEL_NUM]; + +void update_page_count(int level, unsigned long pages) +{ + /* Protect against CPA */ + spin_lock(&pgd_lock); + direct_pages_count[level] += pages; + spin_unlock(&pgd_lock); +} + +static void split_page_count(int level) +{ + if (direct_pages_count[level] == 0) + return; + + direct_pages_count[level]--; + if (system_state == SYSTEM_RUNNING) { + if (level == PG_LEVEL_2M) + count_vm_event(DIRECT_MAP_LEVEL2_SPLIT); + else if (level == PG_LEVEL_1G) + count_vm_event(DIRECT_MAP_LEVEL3_SPLIT); + } + direct_pages_count[level - 1] += PTRS_PER_PTE; +} + +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + direct_pages_count[PG_LEVEL_4K] << 2); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + seq_printf(m, "DirectMap2M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 11); +#else + seq_printf(m, "DirectMap4M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 12); +#endif + if (direct_gbpages) + seq_printf(m, "DirectMap1G: %8lu kB\n", + direct_pages_count[PG_LEVEL_1G] << 20); +} +#else +static inline void split_page_count(int level) { } +#endif + 
+#ifdef CONFIG_X86_CPA_STATISTICS + +static unsigned long cpa_1g_checked; +static unsigned long cpa_1g_sameprot; +static unsigned long cpa_1g_preserved; +static unsigned long cpa_2m_checked; +static unsigned long cpa_2m_sameprot; +static unsigned long cpa_2m_preserved; +static unsigned long cpa_4k_install; + +static inline void cpa_inc_1g_checked(void) +{ + cpa_1g_checked++; +} + +static inline void cpa_inc_2m_checked(void) +{ + cpa_2m_checked++; +} + +static inline void cpa_inc_4k_install(void) +{ + data_race(cpa_4k_install++); +} + +static inline void cpa_inc_lp_sameprot(int level) +{ + if (level == PG_LEVEL_1G) + cpa_1g_sameprot++; + else + cpa_2m_sameprot++; +} + +static inline void cpa_inc_lp_preserved(int level) +{ + if (level == PG_LEVEL_1G) + cpa_1g_preserved++; + else + cpa_2m_preserved++; +} + +static int cpastats_show(struct seq_file *m, void *p) +{ + seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); + seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); + seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); + seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); + seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); + seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); + seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); + return 0; +} + +static int cpastats_open(struct inode *inode, struct file *file) +{ + return single_open(file, cpastats_show, NULL); +} + +static const struct file_operations cpastats_fops = { + .open = cpastats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init cpa_stats_init(void) +{ + debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, + &cpastats_fops); + return 0; +} +late_initcall(cpa_stats_init); +#else +static inline void cpa_inc_1g_checked(void) { } +static inline void cpa_inc_2m_checked(void) { } +static inline void cpa_inc_4k_install(void) { } +static inline void cpa_inc_lp_sameprot(int 
level) { } +static inline void cpa_inc_lp_preserved(int level) { } +#endif + + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +static inline int +within_inclusive(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr <= end; +} + +#ifdef CONFIG_X86_64 + +/* + * The kernel image is mapped into two places in the virtual address space + * (addresses without KASLR, of course): + * + * 1. The kernel direct map (0xffff880000000000) + * 2. The "high kernel map" (0xffffffff81000000) + * + * We actually execute out of #2. If we get the address of a kernel symbol, it + * points to #2, but almost all physical-to-virtual translations point to #1. + * + * This is so that we can have both a directmap of all physical memory *and* + * take full advantage of the limited (s32) immediate addressing range (2G) + * of x86_64. + * + * See Documentation/arch/x86/x86_64/mm.rst for more detail. + */ + +static inline unsigned long highmap_start_pfn(void) +{ + return __pa_symbol(_text) >> PAGE_SHIFT; +} + +static inline unsigned long highmap_end_pfn(void) +{ + /* Do not reference physical address outside the kernel. */ + return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; +} + +static bool __cpa_pfn_in_highmap(unsigned long pfn) +{ + /* + * Kernel text has an alias mapping at a high address, known + * here as "highmap". + */ + return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); +} + +#else + +static bool __cpa_pfn_in_highmap(unsigned long pfn) +{ + /* There is no highmap on 32-bit */ + return false; +} + +#endif + +/* + * See set_mce_nospec(). + * + * Machine check recovery code needs to change cache mode of poisoned pages to + * UC to avoid speculative access logging another error. But passing the + * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a + * speculative access. 
So we cheat and flip the top bit of the address. This + * works fine for the code that updates the page tables. But at the end of the + * process we need to flush the TLB and cache and the non-canonical address + * causes a #GP fault when used by the INVLPG and CLFLUSH instructions. + * + * But in the common case we already have a canonical address. This code + * will fix the top bit if needed and is a no-op otherwise. + */ +static inline unsigned long fix_addr(unsigned long addr) +{ +#ifdef CONFIG_X86_64 + return (long)(addr << 1) >> 1; +#else + return addr; +#endif +} + +static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) +{ + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[idx]; + + if (unlikely(PageHighMem(page))) + return 0; + + return (unsigned long)page_address(page); + } + + if (cpa->flags & CPA_ARRAY) + return cpa->vaddr[idx]; + + return *cpa->vaddr + idx * PAGE_SIZE; +} + +/* + * Flushing functions + */ + +static void clflush_cache_range_opt(void *vaddr, unsigned int size) +{ + const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; + void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); + void *vend = vaddr + size; + + if (p >= vend) + return; + + for (; p < vend; p += clflush_size) + clflushopt(p); +} + +/** + * clflush_cache_range - flush a cache range with clflush + * @vaddr: virtual start address + * @size: number of bytes to flush + * + * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or + * SFENCE to avoid ordering issues. 
+ */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + mb(); + clflush_cache_range_opt(vaddr, size); + mb(); +} +EXPORT_SYMBOL_GPL(clflush_cache_range); + +#ifdef CONFIG_ARCH_HAS_PMEM_API +void arch_invalidate_pmem(void *addr, size_t size) +{ + clflush_cache_range(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); +#endif + +#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION +bool cpu_cache_has_invalidate_memregion(void) +{ + return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); +} +EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM); + +int cpu_cache_invalidate_memregion(int res_desc) +{ + if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) + return -ENXIO; + wbinvd_on_all_cpus(); + return 0; +} +EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM); +#endif + +static void __cpa_flush_all(void *arg) +{ + unsigned long cache = (unsigned long)arg; + + /* + * Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); + + if (cache && boot_cpu_data.x86 >= 4) + wbinvd(); +} + +static void cpa_flush_all(unsigned long cache) +{ + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + + on_each_cpu(__cpa_flush_all, (void *) cache, 1); +} + +static void __cpa_flush_tlb(void *data) +{ + struct cpa_data *cpa = data; + unsigned int i; + + for (i = 0; i < cpa->numpages; i++) + flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); +} + +static void cpa_flush(struct cpa_data *data, int cache) +{ + struct cpa_data *cpa = data; + unsigned int i; + + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return; + } + + if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) + flush_tlb_all(); + else + on_each_cpu(__cpa_flush_tlb, cpa, 1); + + if (!cache) + return; + + mb(); + for (i = 0; i < cpa->numpages; i++) { + unsigned long addr = __cpa_addr(cpa, i); + unsigned int level; + + pte_t 
*pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); + } + mb(); +} + +static bool overlaps(unsigned long r1_start, unsigned long r1_end, + unsigned long r2_start, unsigned long r2_end) +{ + return (r1_start <= r2_end && r1_end >= r2_start) || + (r2_start <= r1_end && r2_end >= r1_start); +} + +#ifdef CONFIG_PCI_BIOS +/* + * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS + * based config access (CONFIG_PCI_GOBIOS) support. + */ +#define BIOS_PFN PFN_DOWN(BIOS_BEGIN) +#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) + +static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) +{ + if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) + return _PAGE_NX; + return 0; +} +#else +static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) +{ + return 0; +} +#endif + +/* + * The .rodata section needs to be read-only. Using the pfn catches all + * aliases. This also includes __ro_after_init, so do not enforce until + * kernel_set_to_readonly is true. + */ +static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) +{ + unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); + + /* + * Note: __end_rodata is at page aligned and not inclusive, so + * subtract 1 to get the last enforced PFN in the rodata area. + */ + epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; + + if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) + return _PAGE_RW; + return 0; +} + +/* + * Protect kernel text against becoming non executable by forbidding + * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) + * out of which the kernel actually executes. Do not protect the low + * mapping. + * + * This does not cover __inittext since that is gone after boot. 
+ */ +static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) +{ + unsigned long t_end = (unsigned long)_etext - 1; + unsigned long t_start = (unsigned long)_text; + + if (overlaps(start, end, t_start, t_end)) + return _PAGE_NX; + return 0; +} + +#if defined(CONFIG_X86_64) +/* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering the + * holes caused by this alignment can be anything that user asks. + * + * This will preserve the large page mappings for kernel text/data at no + * extra cost. + */ +static pgprotval_t protect_kernel_text_ro(unsigned long start, + unsigned long end) +{ + unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; + unsigned long t_start = (unsigned long)_text; + unsigned int level; + + if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) + return 0; + /* + * Don't enforce the !RW mapping for the kernel text mapping, if + * the current mapping is already using small page mapping. No + * need to work hard to preserve large page mappings in this case. + * + * This also fixes the Linux Xen paravirt guest boot failure caused + * by unexpected read-only mappings for kernel identity + * mappings. In this paravirt guest case, the kernel text mapping + * and the kernel identity mapping share the same page-table pages, + * so the protections for kernel text and identity mappings have to + * be the same. 
+ */ + if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) + return _PAGE_RW; + return 0; +} +#else +static pgprotval_t protect_kernel_text_ro(unsigned long start, + unsigned long end) +{ + return 0; +} +#endif + +static inline bool conflicts(pgprot_t prot, pgprotval_t val) +{ + return (pgprot_val(prot) & ~val) != pgprot_val(prot); +} + +static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, + unsigned long start, unsigned long end, + unsigned long pfn, const char *txt) +{ + static const char *lvltxt[] = { + [CPA_CONFLICT] = "conflict", + [CPA_PROTECT] = "protect", + [CPA_DETECT] = "detect", + }; + + if (warnlvl > cpa_warn_level || !conflicts(prot, val)) + return; + + pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", + lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), + (unsigned long long)val); +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, + unsigned long pfn, unsigned long npg, + unsigned long lpsize, int warnlvl) +{ + pgprotval_t forbidden, res; + unsigned long end; + + /* + * There is no point in checking RW/NX conflicts when the requested + * mapping is setting the page !PRESENT. + */ + if (!(pgprot_val(prot) & _PAGE_PRESENT)) + return prot; + + /* Operate on the virtual address */ + end = start + npg * PAGE_SIZE - 1; + + res = protect_kernel_text(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); + forbidden = res; + + /* + * Special case to preserve a large page. If the change spawns the + * full large page mapping then there is no point to split it + * up. 
Happens with ftrace and is going to be removed once ftrace + * switched to text_poke(). + */ + if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) { + res = protect_kernel_text_ro(start, end); + check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); + forbidden |= res; + } + + /* Check the PFN directly */ + res = protect_pci_bios(pfn, pfn + npg - 1); + check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); + forbidden |= res; + + res = protect_rodata(pfn, pfn + npg - 1); + check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); + forbidden |= res; + + return __pgprot(pgprot_val(prot) & ~forbidden); +} + +/* + * Validate strict W^X semantics. + */ +static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, + unsigned long pfn, unsigned long npg) +{ + unsigned long end; + + /* + * 32-bit has some unfixable W+X issues, like EFI code + * and writeable data being in the same page. Disable + * detection and enforcement there. + */ + if (IS_ENABLED(CONFIG_X86_32)) + return new; + + /* Only verify when NX is supported: */ + if (!(__supported_pte_mask & _PAGE_NX)) + return new; + + if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) + return new; + + if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) + return new; + + end = start + npg * PAGE_SIZE - 1; + WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n", + (unsigned long long)pgprot_val(old), + (unsigned long long)pgprot_val(new), + start, end, pfn); + + /* + * For now, allow all permission change attempts by returning the + * attempted permissions. This can 'return old' to actively + * refuse the permission change at a later time. + */ + return new; +} + +/* + * Lookup the page table entry for a virtual address in a specific pgd. + * Return a pointer to the entry and the level of the mapping. 
+ */ +pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, + unsigned int *level) +{ + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + *level = PG_LEVEL_NONE; + + if (pgd_none(*pgd)) + return NULL; + + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return NULL; + + *level = PG_LEVEL_512G; + if (p4d_large(*p4d) || !p4d_present(*p4d)) + return (pte_t *)p4d; + + pud = pud_offset(p4d, address); + if (pud_none(*pud)) + return NULL; + + *level = PG_LEVEL_1G; + if (pud_large(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + + *level = PG_LEVEL_2M; + if (pmd_large(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + *level = PG_LEVEL_4K; + + return pte_offset_kernel(pmd, address); +} + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + return lookup_address_in_pgd(pgd_offset_k(address), address, level); +} +EXPORT_SYMBOL_GPL(lookup_address); + +static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, + unsigned int *level) +{ + if (cpa->pgd) + return lookup_address_in_pgd(cpa->pgd + pgd_index(address), + address, level); + + return lookup_address(address, level); +} + +/* + * Lookup the PMD entry for a virtual address. Return a pointer to the entry + * or NULL if not present. 
+ */ +pmd_t *lookup_pmd_address(unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset_k(address); + if (pgd_none(*pgd)) + return NULL; + + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, address); + if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) + return NULL; + + return pmd_offset(pud, address); +} + +/* + * This is necessary because __pa() does not work on some + * kinds of memory, like vmalloc() or the alloc_remap() + * areas on 32-bit NUMA systems. The percpu areas can + * end up in this kind of memory, for instance. + * + * This could be optimized, but it is only intended to be + * used at initialization time, and keeping it + * unoptimized should increase the testing coverage for + * the more obscure platforms. + */ +phys_addr_t slow_virt_to_phys(void *__virt_addr) +{ + unsigned long virt_addr = (unsigned long)__virt_addr; + phys_addr_t phys_addr; + unsigned long offset; + enum pg_level level; + pte_t *pte; + + pte = lookup_address(virt_addr, &level); + BUG_ON(!pte); + + /* + * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t + * before being left-shifted PAGE_SHIFT bits -- this trick is to + * make 32-PAE kernel work correctly. 
+ */ + switch (level) { + case PG_LEVEL_1G: + phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PUD_MASK; + break; + case PG_LEVEL_2M: + phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PMD_MASK; + break; + default: + phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + offset = virt_addr & ~PAGE_MASK; + } + + return (phys_addr_t)(phys_addr | offset); +} +EXPORT_SYMBOL_GPL(slow_virt_to_phys); + +/* + * Set the new pmd in all the pgds we know about: + */ +static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) +{ + /* change init_mm */ + set_pte_atomic(kpte, pte); +#ifdef CONFIG_X86_32 + if (!SHARED_KERNEL_PMD) { + struct page *page; + + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); + set_pte_atomic((pte_t *)pmd, pte); + } + } +#endif +} + +static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) +{ + /* + * _PAGE_GLOBAL means "global page" for present PTEs. + * But, it is also used to indicate _PAGE_PROTNONE + * for non-present PTEs. + * + * This ensures that a _PAGE_GLOBAL PTE going from + * present to non-present is not confused as + * _PAGE_PROTNONE. 
+ */ + if (!(pgprot_val(prot) & _PAGE_PRESENT)) + pgprot_val(prot) &= ~_PAGE_GLOBAL; + + return prot; +} + +static int __should_split_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; + pgprot_t old_prot, new_prot, req_prot, chk_prot; + pte_t new_pte, *tmp; + enum pg_level level; + + /* + * Check for races, another CPU might have split this page + * up already: + */ + tmp = _lookup_address_cpa(cpa, address, &level); + if (tmp != kpte) + return 1; + + switch (level) { + case PG_LEVEL_2M: + old_prot = pmd_pgprot(*(pmd_t *)kpte); + old_pfn = pmd_pfn(*(pmd_t *)kpte); + cpa_inc_2m_checked(); + break; + case PG_LEVEL_1G: + old_prot = pud_pgprot(*(pud_t *)kpte); + old_pfn = pud_pfn(*(pud_t *)kpte); + cpa_inc_1g_checked(); + break; + default: + return -EINVAL; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Calculate the number of pages, which fit into this large + * page starting at address: + */ + lpaddr = (address + psize) & pmask; + numpages = (lpaddr - address) >> PAGE_SHIFT; + if (numpages < cpa->numpages) + cpa->numpages = numpages; + + /* + * We are safe now. Check whether the new pgprot is the same: + * Convert protection attributes to 4k-format, as cpa->mask* are set + * up accordingly. + */ + + /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ + req_prot = pgprot_large_2_4k(old_prot); + + pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); + + /* + * req_prot is in format of 4k pages. It must be converted to large + * page format: the caching mode includes the PAT bit located at + * different bit positions in the two formats. + */ + req_prot = pgprot_4k_2_large(req_prot); + req_prot = pgprot_clear_protnone_bits(req_prot); + if (pgprot_val(req_prot) & _PAGE_PRESENT) + pgprot_val(req_prot) |= _PAGE_PSE; + + /* + * old_pfn points to the large page base pfn. 
So we need to add the + * offset of the virtual address: + */ + pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); + cpa->pfn = pfn; + + /* + * Calculate the large page base address and the number of 4K pages + * in the large page + */ + lpaddr = address & pmask; + numpages = psize >> PAGE_SHIFT; + + /* + * Sanity check that the existing mapping is correct versus the static + * protections. static_protections() guards against !PRESENT, so no + * extra conditional required here. + */ + chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, + psize, CPA_CONFLICT); + + if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { + /* + * Split the large page and tell the split code to + * enforce static protections. + */ + cpa->force_static_prot = 1; + return 1; + } + + /* + * Optimization: If the requested pgprot is the same as the current + * pgprot, then the large page can be preserved and no updates are + * required independent of alignment and length of the requested + * range. The above already established that the current pgprot is + * correct, which in consequence makes the requested pgprot correct + * as well if it is the same. The static protection scan below will + * not come to a different conclusion. + */ + if (pgprot_val(req_prot) == pgprot_val(old_prot)) { + cpa_inc_lp_sameprot(level); + return 0; + } + + /* + * If the requested range does not cover the full page, split it up + */ + if (address != lpaddr || cpa->numpages != numpages) + return 1; + + /* + * Check whether the requested pgprot is conflicting with a static + * protection requirement in the large page. + */ + new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, + psize, CPA_DETECT); + + new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages); + + /* + * If there is a conflict, split the large page. 
+ * + * There used to be a 4k wise evaluation trying really hard to + * preserve the large pages, but experimentation has shown, that this + * does not help at all. There might be corner cases which would + * preserve one large page occasionally, but it's really not worth the + * extra code and cycles for the common case. + */ + if (pgprot_val(req_prot) != pgprot_val(new_prot)) + return 1; + + /* All checks passed. Update the large page mapping. */ + new_pte = pfn_pte(old_pfn, new_prot); + __set_pmd_pte(kpte, address, new_pte); + cpa->flags |= CPA_FLUSHTLB; + cpa_inc_lp_preserved(level); + return 0; +} + +static int should_split_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + int do_split; + + if (cpa->force_split) + return 1; + + spin_lock(&pgd_lock); + do_split = __should_split_large_page(kpte, address, cpa); + spin_unlock(&pgd_lock); + + return do_split; +} + +static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, + pgprot_t ref_prot, unsigned long address, + unsigned long size) +{ + unsigned int npg = PFN_DOWN(size); + pgprot_t prot; + + /* + * If should_split_large_page() discovered an inconsistent mapping, + * remove the invalid protection in the split mapping. + */ + if (!cpa->force_static_prot) + goto set; + + /* Hand in lpsize = 0 to enforce the protection mechanism */ + prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT); + + if (pgprot_val(prot) == pgprot_val(ref_prot)) + goto set; + + /* + * If this is splitting a PMD, fix it up. PUD splits cannot be + * fixed trivially as that would require to rescan the newly + * installed PMD mappings after returning from split_large_page() + * so an eventual further split can allocate the necessary PTE + * pages. Warn for now and revisit it in case this actually + * happens. 
+ */ + if (size == PAGE_SIZE) + ref_prot = prot; + else + pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); +set: + set_pte(pte, pfn_pte(pfn, ref_prot)); +} + +static int +__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, + struct page *base) +{ + unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; + pte_t *pbase = (pte_t *)page_address(base); + unsigned int i, level; + pgprot_t ref_prot; + pte_t *tmp; + + spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up for us already: + */ + tmp = _lookup_address_cpa(cpa, address, &level); + if (tmp != kpte) { + spin_unlock(&pgd_lock); + return 1; + } + + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); + + switch (level) { + case PG_LEVEL_2M: + ref_prot = pmd_pgprot(*(pmd_t *)kpte); + /* + * Clear PSE (aka _PAGE_PAT) and move + * PAT bit to correct position. + */ + ref_prot = pgprot_large_2_4k(ref_prot); + ref_pfn = pmd_pfn(*(pmd_t *)kpte); + lpaddr = address & PMD_MASK; + lpinc = PAGE_SIZE; + break; + + case PG_LEVEL_1G: + ref_prot = pud_pgprot(*(pud_t *)kpte); + ref_pfn = pud_pfn(*(pud_t *)kpte); + pfninc = PMD_SIZE >> PAGE_SHIFT; + lpaddr = address & PUD_MASK; + lpinc = PMD_SIZE; + /* + * Clear the PSE flags if the PRESENT flag is not set + * otherwise pmd_present/pmd_huge will return true + * even on a non present pmd. + */ + if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) + pgprot_val(ref_prot) &= ~_PAGE_PSE; + break; + + default: + spin_unlock(&pgd_lock); + return 1; + } + + ref_prot = pgprot_clear_protnone_bits(ref_prot); + + /* + * Get the target pfn from the original entry: + */ + pfn = ref_pfn; + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) + split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); + + if (virt_addr_valid(address)) { + unsigned long pfn = PFN_DOWN(__pa(address)); + + if (pfn_range_is_mapped(pfn, pfn + 1)) + split_page_count(level); + } + + /* + * Install the new, split up pagetable. 
+ * + * We use the standard kernel pagetable protections for the new + * pagetable protections, the actual ptes set above control the + * primary protection behavior: + */ + __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); + + /* + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * Without this, we violate the TLB application note, that says: + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. 
+ */ + flush_tlb_all(); + spin_unlock(&pgd_lock); + + return 0; +} + +static int split_large_page(struct cpa_data *cpa, pte_t *kpte, + unsigned long address) +{ + struct page *base; + + if (!debug_pagealloc_enabled()) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + if (!debug_pagealloc_enabled()) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + + if (__split_large_page(cpa, kpte, address, base)) + __free_page(base); + + return 0; +} + +static bool try_to_free_pte_page(pte_t *pte) +{ + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; + + free_page((unsigned long)pte); + return true; +} + +static bool try_to_free_pmd_page(pmd_t *pmd) +{ + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; + + free_page((unsigned long)pmd); + return true; +} + +static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, start); + + while (start < end) { + set_pte(pte, __pte(0)); + + start += PAGE_SIZE; + pte++; + } + + if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; +} + +static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) +{ + if (unmap_pte_range(pmd, start, end)) + if (try_to_free_pmd_page(pud_pgtable(*pud))) + pud_clear(pud); +} + +static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, start); + + /* + * Not on a 2MB page boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + __unmap_pmd_range(pud, pmd, start, pre_end); + + start = pre_end; + pmd++; + } + + /* + * Try to unmap in 2M chunks. 
+ */ + while (end - start >= PMD_SIZE) { + if (pmd_large(*pmd)) + pmd_clear(pmd); + else + __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; + } + + /* + * 4K leftovers? + */ + if (start < end) + return __unmap_pmd_range(pud, pmd, start, end); + + /* + * Try again to free the PMD page if haven't succeeded above. + */ + if (!pud_none(*pud)) + if (try_to_free_pmd_page(pud_pgtable(*pud))) + pud_clear(pud); +} + +static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) +{ + pud_t *pud = pud_offset(p4d, start); + + /* + * Not on a GB page boundary? + */ + if (start & (PUD_SIZE - 1)) { + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + unmap_pmd_range(pud, start, pre_end); + + start = pre_end; + pud++; + } + + /* + * Try to unmap in 1G chunks? + */ + while (end - start >= PUD_SIZE) { + + if (pud_large(*pud)) + pud_clear(pud); + else + unmap_pmd_range(pud, start, start + PUD_SIZE); + + start += PUD_SIZE; + pud++; + } + + /* + * 2M leftovers? 
+ */ + if (start < end) + unmap_pmd_range(pud, start, end); + + /* + * No need to try to free the PUD page because we'll free it in + * populate_pgd's error path + */ +} + +static int alloc_pte_page(pmd_t *pmd) +{ + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!pte) + return -1; + + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + return 0; +} + +static int alloc_pmd_page(pud_t *pud) +{ + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); + if (!pmd) + return -1; + + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + return 0; +} + +static void populate_pte(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, start); + + pgprot = pgprot_clear_protnone_bits(pgprot); + + while (num_pages-- && start < end) { + set_pte(pte, pfn_pte(cpa->pfn, pgprot)); + + start += PAGE_SIZE; + cpa->pfn++; + pte++; + } +} + +static long populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) +{ + long cur_pages = 0; + pmd_t *pmd; + pgprot_t pmd_pgprot; + + /* + * Not on a 2M boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long pre_end = start + (num_pages << PAGE_SHIFT); + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + + pre_end = min_t(unsigned long, pre_end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(unsigned int, num_pages, cur_pages); + + /* + * Need a PTE page? + */ + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); + + start = pre_end; + } + + /* + * We mapped them all? + */ + if (num_pages == cur_pages) + return cur_pages; + + pmd_pgprot = pgprot_4k_2_large(pgprot); + + while (end - start >= PMD_SIZE) { + + /* + * We cannot use a 1G page so allocate a PMD page if needed. 
+ */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + pmd = pmd_offset(pud, start); + + set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, + canon_pgprot(pmd_pgprot)))); + + start += PMD_SIZE; + cpa->pfn += PMD_SIZE >> PAGE_SHIFT; + cur_pages += PMD_SIZE >> PAGE_SHIFT; + } + + /* + * Map trailing 4K pages. + */ + if (start < end) { + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, end, num_pages - cur_pages, + pmd, pgprot); + } + return num_pages; +} + +static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, + pgprot_t pgprot) +{ + pud_t *pud; + unsigned long end; + long cur_pages = 0; + pgprot_t pud_pgprot; + + end = start + (cpa->numpages << PAGE_SHIFT); + + /* + * Not on a Gb page boundary? => map everything up to it with + * smaller pages. + */ + if (start & (PUD_SIZE - 1)) { + unsigned long pre_end; + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + + pre_end = min_t(unsigned long, end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(int, (int)cpa->numpages, cur_pages); + + pud = pud_offset(p4d, start); + + /* + * Need a PMD page? + */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, + pud, pgprot); + if (cur_pages < 0) + return cur_pages; + + start = pre_end; + } + + /* We mapped them all? 
*/ + if (cpa->numpages == cur_pages) + return cur_pages; + + pud = pud_offset(p4d, start); + pud_pgprot = pgprot_4k_2_large(pgprot); + + /* + * Map everything starting from the Gb boundary, possibly with 1G pages + */ + while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { + set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, + canon_pgprot(pud_pgprot)))); + + start += PUD_SIZE; + cpa->pfn += PUD_SIZE >> PAGE_SHIFT; + cur_pages += PUD_SIZE >> PAGE_SHIFT; + pud++; + } + + /* Map trailing leftover */ + if (start < end) { + long tmp; + + pud = pud_offset(p4d, start); + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, + pud, pgprot); + if (tmp < 0) + return cur_pages; + + cur_pages += tmp; + } + return cur_pages; +} + +/* + * Restrictions for kernel page table do not necessarily apply when mapping in + * an alternate PGD. + */ +static int populate_pgd(struct cpa_data *cpa, unsigned long addr) +{ + pgprot_t pgprot = __pgprot(_KERNPG_TABLE); + pud_t *pud = NULL; /* shut up gcc */ + p4d_t *p4d; + pgd_t *pgd_entry; + long ret; + + pgd_entry = cpa->pgd + pgd_index(addr); + + if (pgd_none(*pgd_entry)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + + /* + * Allocate a PUD page and hand it down for mapping. + */ + p4d = p4d_offset(pgd_entry, addr); + if (p4d_none(*p4d)) { + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); + if (!pud) + return -1; + + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); + + ret = populate_pud(cpa, addr, p4d, pgprot); + if (ret < 0) { + /* + * Leave the PUD page in place in case some other CPU or thread + * already found it, but remove any useless entries we just + * added to it. 
+ */ + unmap_pud_range(p4d, addr, + addr + (cpa->numpages << PAGE_SHIFT)); + return ret; + } + + cpa->numpages = ret; + return 0; +} + +static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, + int primary) +{ + if (cpa->pgd) { + /* + * Right now, we only execute this code path when mapping + * the EFI virtual memory map regions, no other users + * provide a ->pgd value. This may change in the future. + */ + return populate_pgd(cpa, vaddr); + } + + /* + * Ignore all non primary paths. + */ + if (!primary) { + cpa->numpages = 1; + return 0; + } + + /* + * Ignore the NULL PTE for kernel identity mapping, as it is expected + * to have holes. + * Also set numpages to '1' indicating that we processed cpa req for + * one virtual address page and its pfn. TBD: numpages can be set based + * on the initial value and the level returned by lookup_address(). + */ + if (within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + cpa->numpages = 1; + cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; + return 0; + + } else if (__cpa_pfn_in_highmap(cpa->pfn)) { + /* Faults in the highmap are OK, so do not warn: */ + return -EFAULT; + } else { + WARN(1, KERN_WARNING "CPA: called for zero pte. 
" + "vaddr = %lx cpa->vaddr = %lx\n", vaddr, + *cpa->vaddr); + + return -EFAULT; + } +} + +static int __change_page_attr(struct cpa_data *cpa, int primary) +{ + unsigned long address; + int do_split, err; + unsigned int level; + pte_t *kpte, old_pte; + + address = __cpa_addr(cpa, cpa->curpage); +repeat: + kpte = _lookup_address_cpa(cpa, address, &level); + if (!kpte) + return __cpa_process_fault(cpa, address, primary); + + old_pte = *kpte; + if (pte_none(old_pte)) + return __cpa_process_fault(cpa, address, primary); + + if (level == PG_LEVEL_4K) { + pte_t new_pte; + pgprot_t old_prot = pte_pgprot(old_pte); + pgprot_t new_prot = pte_pgprot(old_pte); + unsigned long pfn = pte_pfn(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + + cpa_inc_4k_install(); + /* Hand in lpsize = 0 to enforce the protection mechanism */ + new_prot = static_protections(new_prot, address, pfn, 1, 0, + CPA_PROTECT); + + new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1); + + new_prot = pgprot_clear_protnone_bits(new_prot); + + /* + * We need to keep the pfn from the existing PTE, + * after all we're only going to change it's attributes + * not the memory it points to + */ + new_pte = pfn_pte(pfn, new_prot); + cpa->pfn = pfn; + /* + * Do we really change anything ? + */ + if (pte_val(old_pte) != pte_val(new_pte)) { + set_pte_atomic(kpte, new_pte); + cpa->flags |= CPA_FLUSHTLB; + } + cpa->numpages = 1; + return 0; + } + + /* + * Check, whether we can keep the large page intact + * and just change the pte: + */ + do_split = should_split_large_page(kpte, address, cpa); + /* + * When the range fits into the existing large page, + * return. 
cp->numpages and cpa->tlbflush have been updated in + * try_large_page: + */ + if (do_split <= 0) + return do_split; + + /* + * We have to split the large page: + */ + err = split_large_page(cpa, kpte, address); + if (!err) + goto repeat; + + return err; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); + +/* + * Check the directmap and "high kernel map" 'aliases'. + */ +static int cpa_process_alias(struct cpa_data *cpa) +{ + struct cpa_data alias_cpa; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr; + int ret; + + if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) + return 0; + + /* + * No need to redo, when the primary call touched the direct + * mapping already: + */ + vaddr = __cpa_addr(cpa, cpa->curpage); + if (!(within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { + + alias_cpa = *cpa; + alias_cpa.vaddr = &laddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + alias_cpa.curpage = 0; + + /* Directmap always has NX set, do not modify. */ + if (__supported_pte_mask & _PAGE_NX) { + alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; + alias_cpa.mask_set.pgprot &= ~_PAGE_NX; + } + + cpa->force_flush_all = 1; + + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + +#ifdef CONFIG_X86_64 + /* + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need + * to touch the high mapped kernel as well: + */ + if (!within(vaddr, (unsigned long)_text, _brk_end) && + __cpa_pfn_in_highmap(cpa->pfn)) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + alias_cpa.curpage = 0; + + /* + * [_text, _brk_end) also covers data, do not modify NX except + * in cases where the highmap is the primary target. 
+ */ + if (__supported_pte_mask & _PAGE_NX) { + alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; + alias_cpa.mask_set.pgprot &= ~_PAGE_NX; + } + + cpa->force_flush_all = 1; + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } +#endif + + return 0; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) +{ + unsigned long numpages = cpa->numpages; + unsigned long rempages = numpages; + int ret = 0; + + /* + * No changes, easy! + */ + if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) && + !cpa->force_split) + return ret; + + while (rempages) { + /* + * Store the remaining nr of pages for the large page + * preservation check. + */ + cpa->numpages = rempages; + /* for array changes, we can't use large page */ + if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa->numpages = 1; + + if (!debug_pagealloc_enabled()) + spin_lock(&cpa_lock); + ret = __change_page_attr(cpa, primary); + if (!debug_pagealloc_enabled()) + spin_unlock(&cpa_lock); + if (ret) + goto out; + + if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { + ret = cpa_process_alias(cpa); + if (ret) + goto out; + } + + /* + * Adjust the number of pages with the result of the + * CPA operation. Either a large page has been + * preserved or a single page update happened. + */ + BUG_ON(cpa->numpages > rempages || !cpa->numpages); + rempages -= cpa->numpages; + cpa->curpage += cpa->numpages; + } + +out: + /* Restore the original numpages */ + cpa->numpages = numpages; + return ret; +} + +static int change_page_attr_set_clr(unsigned long *addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr, + int force_split, int in_flag, + struct page **pages) +{ + struct cpa_data cpa; + int ret, cache; + + memset(&cpa, 0, sizeof(cpa)); + + /* + * Check, if we are requested to set a not supported + * feature. Clearing non-supported features is OK. 
+ */ + mask_set = canon_pgprot(mask_set); + + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) + return 0; + + /* Ensure we are PAGE_SIZE aligned */ + if (in_flag & CPA_ARRAY) { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } + } else if (!(in_flag & CPA_PAGES_ARRAY)) { + /* + * in_flag of CPA_PAGES_ARRAY implies it is aligned. + * No need to check in that case + */ + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + } + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + + vm_unmap_aliases(); + + cpa.vaddr = addr; + cpa.pages = pages; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; + cpa.flags = in_flag; + cpa.curpage = 0; + cpa.force_split = force_split; + + ret = __change_page_attr_set_clr(&cpa, 1); + + /* + * Check whether we really changed something: + */ + if (!(cpa.flags & CPA_FLUSHTLB)) + goto out; + + /* + * No need to flush, when we did not set any of the caching + * attributes: + */ + cache = !!pgprot2cachemode(mask_set); + + /* + * On error; flush everything to be sure. + */ + if (ret) { + cpa_flush_all(cache); + goto out; + } + + cpa_flush(&cpa, cache); +out: + return ret; +} + +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + (array ? CPA_ARRAY : 0), NULL); +} + +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + (array ? 
CPA_ARRAY : 0), NULL); +} + +static inline int cpa_set_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, + CPA_PAGES_ARRAY, pages); +} + +static inline int cpa_clear_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, + CPA_PAGES_ARRAY, pages); +} + +/* + * __set_memory_prot is an internal helper for callers that have been passed + * a pgprot_t value from upper layers and a reservation has already been taken. + * If you want to set the pgprot to a specific page protocol, use the + * set_memory_xx() functions. + */ +int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot) +{ + return change_page_attr_set_clr(&addr, numpages, prot, + __pgprot(~pgprot_val(prot)), 0, 0, + NULL); +} + +int _set_memory_uc(unsigned long addr, int numpages) +{ + /* + * for now UC MINUS. see comments in ioremap() + * If you really need strong UC use ioremap_uc(), but note + * that you cannot override IO areas with set_memory_*() as + * these helpers cannot work with IO memory. + */ + return change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + int ret; + + /* + * for now UC MINUS. 
see comments in ioremap() + */ + ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_UC_MINUS, NULL); + if (ret) + goto out_err; + + ret = _set_memory_uc(addr, numpages); + if (ret) + goto out_free; + + return 0; + +out_free: + memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; +} +EXPORT_SYMBOL(set_memory_uc); + +int _set_memory_wc(unsigned long addr, int numpages) +{ + int ret; + + ret = change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); + if (!ret) { + ret = change_page_attr_set_clr(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, 0, NULL); + } + return ret; +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + int ret; + + ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_WC, NULL); + if (ret) + return ret; + + ret = _set_memory_wc(addr, numpages); + if (ret) + memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL(set_memory_wc); + +int _set_memory_wt(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); +} + +int _set_memory_wb(unsigned long addr, int numpages) +{ + /* WB cache mode is hard wired to all cache attribute bits being 0 */ + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); +} + +int set_memory_wb(unsigned long addr, int numpages) +{ + int ret; + + ret = _set_memory_wb(addr, numpages); + if (ret) + return ret; + + memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + return 0; +} +EXPORT_SYMBOL(set_memory_wb); + +/* Prevent speculative access to a page by marking it not-present */ +#ifdef CONFIG_X86_64 +int set_mce_nospec(unsigned long pfn) +{ + unsigned long decoy_addr; + int rc; + + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; + /* + * We 
would like to just call: + * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the poison page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_XX() properly sanitizing any __pa() + * results with __PHYSICAL_MASK or PTE_PFN_MASK. + */ + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); + + rc = set_memory_np(decoy_addr, 1); + if (rc) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + return rc; +} + +static int set_memory_p(unsigned long *addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + +/* Restore full speculative operation to the pfn. */ +int clear_mce_nospec(unsigned long pfn) +{ + unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); + + return set_memory_p(&addr, 1); +} +EXPORT_SYMBOL_GPL(clear_mce_nospec); +#endif /* CONFIG_X86_64 */ + +int set_memory_x(unsigned long addr, int numpages) +{ + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); +} + +int set_memory_nx(unsigned long addr, int numpages) +{ + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); +} + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0); +} + +int set_memory_rox(unsigned long addr, int numpages) +{ + pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY); + + if (__supported_pte_mask & _PAGE_NX) + clr.pgprot |= _PAGE_NX; + + return change_page_attr_clear(&addr, numpages, clr, 0); +} + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, 
__pgprot(_PAGE_RW), 0); +} + +int set_memory_np(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + +int set_memory_np_noalias(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(_PAGE_PRESENT), 0, + CPA_NO_CHECK_ALIAS, NULL); +} + +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0, NULL); +} + +int set_memory_nonglobal(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + +int set_memory_global(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + +/* + * __set_memory_enc_pgtable() is used for the hypervisors that get + * informed about "encryption" status via page tables. + */ +static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) +{ + pgprot_t empty = __pgprot(0); + struct cpa_data cpa; + int ret; + + /* Should not be working on unaligned addresses */ + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) + addr &= PAGE_MASK; + + memset(&cpa, 0, sizeof(cpa)); + cpa.vaddr = &addr; + cpa.numpages = numpages; + cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty); + cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty); + cpa.pgd = init_mm.pgd; + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + vm_unmap_aliases(); + + /* Flush the caches as needed before changing the encryption attribute. */ + if (x86_platform.guest.enc_tlb_flush_required(enc)) + cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); + + /* Notify hypervisor that we are about to set/clr encryption attribute. 
*/ + if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc)) + return -EIO; + + ret = __change_page_attr_set_clr(&cpa, 1); + + /* + * After changing the encryption attribute, we need to flush TLBs again + * in case any speculative TLB caching occurred (but no need to flush + * caches again). We could just use cpa_flush_all(), but in case TLB + * flushing gets optimized in the cpa_flush() path use the same logic + * as above. + */ + cpa_flush(&cpa, 0); + + /* Notify hypervisor that we have successfully set/clr encryption attribute. */ + if (!ret) { + if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) + ret = -EIO; + } + + return ret; +} + +static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +{ + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) + return __set_memory_enc_pgtable(addr, numpages, enc); + + return 0; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, true); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, false); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); + +int set_pages_uc(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc); + +static int _set_pages_array(struct page **pages, int numpages, + enum page_cache_mode new_type) +{ + unsigned long start; + unsigned long end; + enum page_cache_mode set_type; + int i; + int free_idx; + int ret; + + for (i = 0; i < numpages; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + if (memtype_reserve(start, end, new_type, NULL)) + goto err_out; + } + + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
+ _PAGE_CACHE_MODE_UC_MINUS : new_type; + + ret = cpa_set_pages_array(pages, numpages, + cachemode2pgprot(set_type)); + if (!ret && new_type == _PAGE_CACHE_MODE_WC) + ret = change_page_attr_set_clr(NULL, numpages, + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, CPA_PAGES_ARRAY, pages); + if (ret) + goto err_out; + return 0; /* Success */ +err_out: + free_idx = i; + for (i = 0; i < free_idx; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + memtype_free(start, end); + } + return -EINVAL; +} + +int set_pages_array_uc(struct page **pages, int numpages) +{ + return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS); +} +EXPORT_SYMBOL(set_pages_array_uc); + +int set_pages_array_wc(struct page **pages, int numpages) +{ + return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC); +} +EXPORT_SYMBOL(set_pages_array_wc); + +int set_pages_wb(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_wb(addr, numpages); +} +EXPORT_SYMBOL(set_pages_wb); + +int set_pages_array_wb(struct page **pages, int numpages) +{ + int retval; + unsigned long start; + unsigned long end; + int i; + + /* WB cache mode is hard wired to all cache attribute bits being 0 */ + retval = cpa_clear_pages_array(pages, numpages, + __pgprot(_PAGE_CACHE_MASK)); + if (retval) + return retval; + + for (i = 0; i < numpages; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + memtype_free(start, end); + } + + return 0; +} +EXPORT_SYMBOL(set_pages_array_wb); + +int set_pages_ro(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_ro(addr, numpages); +} + +int set_pages_rw(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_rw(addr, numpages); 
+} + +static int __set_pages_p(struct page *page, int numpages) +{ + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, + .numpages = numpages, + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(0), + .flags = CPA_NO_CHECK_ALIAS }; + + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 1); +} + +static int __set_pages_np(struct page *page, int numpages) +{ + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = CPA_NO_CHECK_ALIAS }; + + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 1); +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_pages_np(page, 1); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_pages_p(page, 1); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + if (!enable) { + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); + } + + /* + * The return value is ignored as the calls cannot fail. + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. 
+ */ + if (enable) + __set_pages_p(page, numpages); + else + __set_pages_np(page, numpages); + + /* + * We should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu. + * Preemption needs to be disabled around __flush_tlb_all() due to + * CR3 reload in __native_flush_tlb(). + */ + preempt_disable(); + __flush_tlb_all(); + preempt_enable(); + + arch_flush_lazy_mmu_mode(); +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +bool kernel_page_present(struct page *page) +{ + unsigned int level; + pte_t *pte; + + if (PageHighMem(page)) + return false; + + pte = lookup_address((unsigned long)page_address(page), &level); + return (pte_val(*pte) & _PAGE_PRESENT); +} + +int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags) +{ + int retval = -EINVAL; + + struct cpa_data cpa = { + .vaddr = &address, + .pfn = pfn, + .pgd = pgd, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), + .flags = CPA_NO_CHECK_ALIAS, + }; + + WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); + + if (!(__supported_pte_mask & _PAGE_NX)) + goto out; + + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); + + retval = __change_page_attr_set_clr(&cpa, 1); + __flush_tlb_all(); + +out: + return retval; +} + +/* + * __flush_tlb_all() flushes mappings only on current CPU and hence this + * function shouldn't be used in an SMP environment. Presently, it's used only + * during boot (way before smp_init()) by EFI subsystem and hence is ok. 
+ */ +int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, + unsigned long numpages) +{ + int retval; + + /* + * The typical sequence for unmapping is to find a pte through + * lookup_address_in_pgd() (ideally, it should never return NULL because + * the address is already mapped) and change it's protections. As pfn is + * the *target* of a mapping, it's not useful while unmapping. + */ + struct cpa_data cpa = { + .vaddr = &address, + .pfn = 0, + .pgd = pgd, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = CPA_NO_CHECK_ALIAS, + }; + + WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); + + retval = __change_page_attr_set_clr(&cpa, 1); + __flush_tlb_all(); + + return retval; +} + +/* + * The testcases use internal knowledge of the implementation that shouldn't + * be exposed to the rest of the kernel. Include these directly here. + */ +#ifdef CONFIG_CPA_DEBUG +#include "cpa-test.c" +#endif diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c new file mode 100644 index 0000000000..3f83e31b3a --- /dev/null +++ b/arch/x86/mm/pf_in.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. 
+ */ + +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include <linux/ptrace.h> /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B, 0xAB }; +#endif /* not __i386__ 
*/ + +struct prefix_bits { + unsigned shorted:1; + unsigned enlarged:1; + unsigned rexr:1; + unsigned rex:1; +}; + +static int skip_prefix(unsigned char *addr, struct prefix_bits *prf) +{ + int i; + unsigned char *p = addr; + prf->shorted = 0; + prf->enlarged = 0; + prf->rexr = 0; + prf->rex = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == prefix_codes[i]) { + if (*p == 0x66) + prf->shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + prf->enlarged = 1; + if ((*p & 0xf4) == 0x44) + prf->rexr = 1; + if ((*p & 0xf0) == 0x40) + prf->rex = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return prf.shorted ? 2 : (prf.enlarged ? 
8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return prf.shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. + */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)&regs->ax; + break; + case arg_BL: + rv = (unsigned char *)&regs->bx; + break; + case arg_CL: + rv = (unsigned char *)&regs->cx; + break; + case arg_DL: + rv = (unsigned char *)&regs->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)&regs->r8; + break; + case arg_R9: + rv = (unsigned char *)&regs->r9; + break; + case arg_R10: + rv = (unsigned char *)&regs->r10; + break; + case arg_R11: + rv = (unsigned char *)&regs->r11; + break; + case arg_R12: + rv = (unsigned char *)&regs->r12; + break; + case arg_R13: + rv = (unsigned char *)&regs->r13; + break; + case arg_R14: 
+ rv = (unsigned char *)&regs->r14; + break; + case arg_R15: + rv = (unsigned char *)&regs->r15; + break; +#endif + default: + break; + } + + if (rv) + return rv; + + if (rex) { + /* + * If REX prefix exists, access low bytes of SI etc. + * instead of AH etc. + */ + switch (no) { + case arg_SI: + rv = (unsigned char *)&regs->si; + break; + case arg_DI: + rv = (unsigned char *)&regs->di; + break; + case arg_BP: + rv = (unsigned char *)&regs->bp; + break; + case arg_SP: + rv = (unsigned char *)&regs->sp; + break; + default: + break; + } + } else { + switch (no) { + case arg_AH: + rv = 1 + (unsigned char *)&regs->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)&regs->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)&regs->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)&regs->dx; + break; + default: + break; + } + } + + if (!rv) + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = &regs->ax; + break; + case arg_BX: + rv = &regs->bx; + break; + case arg_CX: + rv = &regs->cx; + break; + case arg_DX: + rv = &regs->dx; + break; + case arg_SP: + rv = &regs->sp; + break; + case arg_BP: + rv = &regs->bp; + break; + case arg_SI: + rv = &regs->si; + break; + case arg_DI: + rv = &regs->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = &regs->r8; + break; + case arg_R9: + rv = &regs->r9; + break; + case arg_R10: + rv = &regs->r10; + break; + case arg_R11: + rv = &regs->r11; + break; + case arg_R12: + rv = &regs->r12; + break; + case arg_R13: + rv = &regs->r13; + break; + case arg_R14: + rv = &regs->r14; + break; + case arg_R15: + rv = &regs->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + int reg; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, 
&prf); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) + goto do_work; + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) + goto do_work; + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + /* for STOS, source register is fixed */ + if (opcode == 0xAA || opcode == 0xAB) { + reg = arg_AX; + } else { + unsigned char mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + } + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, prf.rex, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + +unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) + goto do_work; + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? 
*/ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h new file mode 100644 index 0000000000..e2a13dce0e --- /dev/null +++ b/arch/x86/mm/pf_in.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c new file mode 100644 index 0000000000..c84bd9540b --- /dev/null +++ b/arch/x86/mm/pgprot.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <linux/mm.h> +#include <asm/pgtable.h> +#include <asm/mem_encrypt.h> + +static pgprot_t protection_map[16] __ro_after_init = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY_EXEC, + 
[VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC +}; + +void add_encrypt_protection_map(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long val = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + /* + * Take the 4 protection key bits out of the vma->vm_flags value and + * turn them in to the bits that we can put in to a pte. + * + * Only override these if Protection Keys are available (which is only + * on 64-bit). 
+ */ + if (vm_flags & VM_PKEY_BIT0) + val |= _PAGE_PKEY_BIT0; + if (vm_flags & VM_PKEY_BIT1) + val |= _PAGE_PKEY_BIT1; + if (vm_flags & VM_PKEY_BIT2) + val |= _PAGE_PKEY_BIT2; + if (vm_flags & VM_PKEY_BIT3) + val |= _PAGE_PKEY_BIT3; +#endif + + val = __sme_set(val); + if (val & _PAGE_PRESENT) + val &= __supported_pte_mask; + return __pgprot(val); +} +EXPORT_SYMBOL(vm_get_page_prot); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 0000000000..9deadf517f --- /dev/null +++ b/arch/x86/mm/pgtable.c @@ -0,0 +1,923 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/hugetlb.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/fixmap.h> +#include <asm/mtrr.h> + +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; +EXPORT_SYMBOL(physical_mask); +#endif + +#ifdef CONFIG_HIGHPTE +#define PGTABLE_HIGHMEM __GFP_HIGHMEM +#else +#define PGTABLE_HIGHMEM 0 +#endif + +#ifndef CONFIG_PARAVIRT +/* Non-paravirt fallback: hand the table straight to tlb_remove_page(). */ static inline +void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_page(tlb, table); +} +#endif + +gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; + +/* Allocate a user PTE page table using the __userpte_alloc_gfp flags. */ pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return __pte_alloc_one(mm, __userpte_alloc_gfp); +} + +/* Parse the "userpte=" early parameter; only "nohigh" is accepted, anything else (or no argument) yields -EINVAL. */ static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. 
+ */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + +/* Release a PTE page: run the ptdesc destructor, tell paravirt the pfn is released, then queue the page on the mmu_gather. */ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pagetable_pte_dtor(page_ptdesc(pte)); + paravirt_release_pte(page_to_pfn(pte)); + paravirt_tlb_remove_table(tlb, pte); +} + +#if CONFIG_PGTABLE_LEVELS > 2 +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + /* + * NOTE! For PAE, any changes to the top page-directory-pointer-table + * entries need a full cr3 reload to flush. + */ +#ifdef CONFIG_X86_PAE + tlb->need_flush_all = 1; +#endif + pagetable_pmd_dtor(ptdesc); + paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); +} + +#if CONFIG_PGTABLE_LEVELS > 3 +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); + paravirt_tlb_remove_table(tlb, virt_to_page(pud)); +} + +#if CONFIG_PGTABLE_LEVELS > 4 +void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) +{ + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); + paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); +} +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ + +/* Link the ptdesc backing @pgd into the global pgd_list. */ static inline void pgd_list_add(pgd_t *pgd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); + + list_add(&ptdesc->pt_list, &pgd_list); +} + +/* Unlink the ptdesc backing @pgd from the list it was added to. */ static inline void pgd_list_del(pgd_t *pgd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); + + list_del(&ptdesc->pt_list); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? 
KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) +#define MAX_UNSHARED_PTRS_PER_PGD \ + max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) + + +/* Record @mm as the owner in the ptdesc backing @pgd. */ static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + virt_to_ptdesc(pgd)->pt_mm = mm; +} + +/* Return the mm stored in the ptdesc of a pgd @page (see pgd_set_mm()). */ struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return page_ptdesc(page)->pt_mm; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) +{ + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (CONFIG_PGTABLE_LEVELS == 2 || + (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + CONFIG_PGTABLE_LEVELS >= 4) { + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); + pgd_list_add(pgd); + } +} + +/* Undo the list insertion done by pgd_ctor(): unlink @pgd under pgd_lock. Nothing to do when the kernel PMD is shared, because pgd_ctor() only lists the pgd in the unshared case. */ static void pgd_dtor(pgd_t *pgd) +{ + if (SHARED_KERNEL_PMD) + return; + + spin_lock(&pgd_lock); + pgd_list_del(pgd); + spin_unlock(&pgd_lock); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. 
+ * -- nyc + */ + +#ifdef CONFIG_X86_PAE +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. 
+ * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD +#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD + +/* + * We allocate separate PMDs for the kernel part of the user page-table + * when PTI is enabled. We need them to map the per-process LDT into the + * user-space page-table. + * + * PREALLOCATED_USER_PMDS is evaluated at run time (it tests the PTI CPU + * feature); MAX_PREALLOCATED_USER_PMDS is the compile-time upper bound. + */ +#define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ + KERNEL_PGD_PTRS : 0) +#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS + +/* PAE only: point the top-level entry @pudp at @pmd (setting just _PAGE_PRESENT) and then flush the TLB for @mm. */ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + flush_tlb_mm(mm); +} +#else /* !CONFIG_X86_PAE */ + +/* No need to prepopulate any pagetable entries in non-PAE modes. 
*/ +#define PREALLOCATED_PMDS 0 +#define MAX_PREALLOCATED_PMDS 0 +#define PREALLOCATED_USER_PMDS 0 +#define MAX_PREALLOCATED_USER_PMDS 0 +#endif /* CONFIG_X86_PAE */ + +/* Destroy and free every non-NULL entry in @pmds[0..@count), decrementing the PMD count of @mm for each one. */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) +{ + int i; + struct ptdesc *ptdesc; + + for (i = 0; i < count; i++) + if (pmds[i]) { + ptdesc = virt_to_ptdesc(pmds[i]); + + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); + mm_dec_nr_pmds(mm); + } +} + +/* + * Allocate and construct @count PMD pages and store them in @pmds[]. + * Slots whose allocation or constructor failed are set to NULL; the loop + * still visits every slot so that free_pmds() sees a fully initialised + * array. Returns 0 on success, or -ENOMEM after freeing whatever was + * allocated. + */ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) +{ + int i; + bool failed = false; + gfp_t gfp = GFP_PGTABLE_USER; + + /* init_mm allocations are not accounted; highmem is never used here. */ if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + gfp &= ~__GFP_HIGHMEM; + + for (i = 0; i < count; i++) { + pmd_t *pmd = NULL; + struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); + + if (!ptdesc) + failed = true; + if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); + ptdesc = NULL; + failed = true; + } + if (ptdesc) { + mm_inc_nr_pmds(mm); + pmd = ptdesc_address(ptdesc); + } + + pmds[i] = pmd; + } + + if (failed) { + free_pmds(mm, pmds, count); + return -ENOMEM; + } + + return 0; +} + +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. 
+ */ +static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = *pgdp; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgd_clear(pgdp); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); + } +} + +/* Mop up the preallocated kernel-side PMDs of @pgdp and, when PTI is compiled in and enabled, the preallocated PMDs of the matching user pgd (slots from KERNEL_PGD_BOUNDARY upwards). */ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for (i = 0; i < PREALLOCATED_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i]); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; + + pgdp = kernel_to_user_pgdp(pgdp); + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); +#endif +} + +/* Install the preallocated @pmds into the top-level entries of @pgd. For slots at or above KERNEL_PGD_BOUNDARY the PMD page is first filled with a copy of the PMD page referenced by the same slot of swapper_pg_dir. */ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) +{ + p4d_t *p4d; + pud_t *pud; + int i; + + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); + + for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { + pmd_t *pmd = pmds[i]; + + if (i >= KERNEL_PGD_BOUNDARY) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, pud, pmd); + } +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* PTI variant of pgd_prepopulate_pmd(): fill each of @pmds with a copy of the PMD page referenced by the user copy of swapper_pg_dir and install it in the user pgd that belongs to @k_pgd, starting at slot KERNEL_PGD_BOUNDARY. */ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ + pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + p4d_t *u_p4d; + pud_t *u_pud; + int i; + + u_p4d = p4d_offset(u_pgd, 0); + u_pud = pud_offset(u_p4d, 0); + + s_pgd += KERNEL_PGD_BOUNDARY; + u_pud += KERNEL_PGD_BOUNDARY; + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { + pmd_t *pmd = pmds[i]; + + memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, u_pud, pmd); + } + +} +#else +/* Without CONFIG_PAGE_TABLE_ISOLATION there is no user pgd to populate. */ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ +} +#endif +/* + * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also + * assumes that pgd should be in one page. 
+ * + * But kernel with PAE paging that is not running as a Xen domain + * only needs to allocate 32 bytes for pgd instead of one page. + */ +#ifdef CONFIG_X86_PAE + +#include <linux/slab.h> + +#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) +#define PGD_ALIGN 32 + +static struct kmem_cache *pgd_cache; + +void __init pgtable_cache_init(void) +{ + /* + * When PAE kernel is running as a Xen domain, it does not use + * shared kernel pmd. And this requires a whole page for pgd. + */ + if (!SHARED_KERNEL_PMD) + return; + + /* + * When the PAE kernel is not running as a Xen domain, it uses a + * shared kernel pmd. A shared kernel pmd does not require a whole + * page for the pgd, so PGD_SIZE bytes aligned to PGD_ALIGN suffice. + * During boot time, we create a slab cache for pgd table allocation. + */ + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, + SLAB_PANIC, NULL); +} + +static inline pgd_t *_pgd_alloc(void) +{ + /* + * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. + * We allocate one page for pgd. + */ + if (!SHARED_KERNEL_PMD) + return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, + PGD_ALLOCATION_ORDER); + + /* + * Now PAE kernel is not running as a Xen domain. We can allocate + * a 32-byte slab for pgd to save memory space. 
+ */ + return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); +} + +/* Free a pgd with the allocator that _pgd_alloc() used for it: the page allocator when the kernel PMD is not shared, pgd_cache otherwise. */ static inline void _pgd_free(pgd_t *pgd) +{ + if (!SHARED_KERNEL_PMD) + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + else + kmem_cache_free(pgd_cache, pgd); +} +#else + +/* Non-PAE: the pgd always comes from the page allocator. */ static inline pgd_t *_pgd_alloc(void) +{ + return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, + PGD_ALLOCATION_ORDER); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); +} +#endif /* CONFIG_X86_PAE */ + +/* + * Allocate and initialise a pgd for @mm. + * + * The kernel and user PMD arrays have zero length when nothing is + * preallocated, which is why each use is guarded by a sizeof() test. + * Returns the new pgd, or NULL after releasing everything allocated so far. + * + * NOTE(review): mm->pgd is assigned before the fallible steps and is not + * reset on the error paths, so it is left pointing at the freed pgd when + * NULL is returned -- confirm that callers overwrite or ignore it. + */ pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd; + pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; + pmd_t *pmds[MAX_PREALLOCATED_PMDS]; + + pgd = _pgd_alloc(); + + if (pgd == NULL) + goto out; + + mm->pgd = pgd; + + if (sizeof(pmds) != 0 && + preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) + goto out_free_pgd; + + if (sizeof(u_pmds) != 0 && + preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) + goto out_free_pmds; + + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_user_pmds; + + /* + * Make sure that pre-populating the pmds is atomic with + * respect to anything walking the pgd_list, so that they + * never see a partially populated pgd. + */ + spin_lock(&pgd_lock); + + pgd_ctor(mm, pgd); + if (sizeof(pmds) != 0) + pgd_prepopulate_pmd(mm, pgd, pmds); + + if (sizeof(u_pmds) != 0) + pgd_prepopulate_user_pmd(mm, pgd, u_pmds); + + spin_unlock(&pgd_lock); + + return pgd; + +out_free_user_pmds: + if (sizeof(u_pmds) != 0) + free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); +out_free_pmds: + if (sizeof(pmds) != 0) + free_pmds(mm, pmds, PREALLOCATED_PMDS); +out_free_pgd: + _pgd_free(pgd); +out: + return NULL; +} + +/* Tear down a pgd created by pgd_alloc(): free leftover preallocated PMDs, unlink it from pgd_list, notify paravirt, then free the pgd itself. */ void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + paravirt_pgd_free(mm, pgd); + _pgd_free(pgd); +} + +/* + * Used to set accessed or dirty bits in the page table entries + * on other architectures. On x86, the accessed and dirty bits + * are tracked by hardware. 
However, do_wp_page calls this function + * to also make the pte writeable at the same time the dirty bit is + * set. In that case we do actually need to write the PTE. + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) + set_pte(ptep, entry); + + return changed; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (changed && dirty) { + set_pmd(pmdp, entry); + /* + * We had a write-protection fault here and changed the pmd + * to be more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ + } + + return changed; +} + +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed = !pud_same(*pudp, entry); + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + + if (changed && dirty) { + set_pud(pudp, entry); + /* + * We had a write-protection fault here and changed the pud + * to be more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. 
+ */ + } + + return changed; +} +#endif + +/* Clear the accessed bit of *@ptep if it is set; the return value is non-zero only when the bit was set. @vma and @addr are unused here. */ int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &ptep->pte); + + return ret; +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +/* PMD counterpart of ptep_test_and_clear_young(). */ int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int ret = 0; + + if (pmd_young(*pmdp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pmdp); + + return ret; +} +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* PUD counterpart of ptep_test_and_clear_young(). */ int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) +{ + int ret = 0; + + if (pud_young(*pudp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pudp); + + return ret; +} +#endif + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + /* + * On x86 CPUs, clearing the accessed bit without a TLB flush + * doesn't cause data corruption. [ It could cause incorrect + * page aging and the (mistaken) reclaim of hot pages, but the + * chance of that should be relatively low. ] + * + * So as a performance optimization don't flush the TLB when + * clearing the accessed bit, it will eventually be flushed by + * a context switch or a VM operation anyway. [ In the rare + * event of it not getting flushed for a long time the delay + * shouldn't really matter because there's no real memory + * pressure for swapout to react to. 
] + */ + return ptep_test_and_clear_young(vma, address, ptep); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* Unlike the PTE variant above, the huge-PMD variant does flush the HPAGE_PMD_SIZE range when the accessed bit was set. */ int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + return young; +} + +pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + /* + * No flush is necessary. Once an invalid PTE is established, the PTE's + * access and dirty bits cannot be updated. + */ + return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); +} +#endif + +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve: size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + * + * Only has an effect on CONFIG_X86_32, and must be called before any + * fixmap entry has been set (it BUGs if fixmaps_set is non-zero). + */ +void __init reserve_top_address(unsigned long reserve) +{ +#ifdef CONFIG_X86_32 + BUG_ON(fixmaps_set > 0); + __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; + printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", + -reserve, __FIXADDR_TOP + PAGE_SIZE); +#endif +} + +/* Number of fixmap entries installed so far by __native_set_fixmap(). */ int fixmaps_set; + +void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) +{ + unsigned long address = __fix_to_virt(idx); + +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. 
+ */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_vaddr(address, pte); + fixmaps_set++; +} + +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) +{ + /* Sanitize 'flags' against any unsupported bits: */ + pgprot_val(flags) &= __default_kernel_pte_mask; + + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); +} + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +#ifdef CONFIG_X86_5LEVEL +/** + * p4d_set_huge - setup kernel P4D mapping + * + * No 512GB pages yet -- always return 0 + */ +int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} + +/** + * p4d_clear_huge - clear kernel P4D mapping when it is set + * + * No 512GB pages yet -- nothing to clear, so this is a no-op + */ +void p4d_clear_huge(p4d_t *p4d) +{ +} +#endif + +/** + * pud_set_huge - setup kernel PUD mapping + * + * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this + * function sets up a huge page only if the complete range has the same MTRR + * caching mode. + * + * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger + * page mapping attempt fails. + * + * Returns 1 on success and 0 on failure. + */ +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + u8 uniform; + + mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if (!uniform) + return 0; + + /* Bail out if we are on a populated non-leaf entry: */ + if (pud_present(*pud) && !pud_huge(*pud)) + return 0; + + set_pte((pte_t *)pud, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); + + return 1; +} + +/** + * pmd_set_huge - setup kernel PMD mapping + * + * See text over pud_set_huge() above. + * + * Returns 1 on success and 0 on failure. 
+ */ +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + u8 uniform; + + mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if (!uniform) { + pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", + __func__, addr, addr + PMD_SIZE); + return 0; + } + + /* Bail out if we are on a populated non-leaf entry: */ + if (pmd_present(*pmd) && !pmd_huge(*pmd)) + return 0; + + set_pte((pte_t *)pmd, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); + + return 1; +} + +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ +int pud_clear_huge(pud_t *pud) +{ + if (pud_large(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + +/** + * pmd_clear_huge - clear kernel PMD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PMD map is found). + */ +int pmd_clear_huge(pmd_t *pmd) +{ + if (pmd_large(*pmd)) { + pmd_clear(pmd); + return 1; + } + + return 0; +} + +#ifdef CONFIG_X86_64 +/** + * pud_free_pmd_page - Clear pud entry and free pmd page. + * @pud: Pointer to a PUD. + * @addr: Virtual address associated with pud. + * + * Context: The pud range has been unmapped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 if the temporary page + * used to save the pmd entries could not be allocated. + * + * NOTE: Callers must allow a single page allocation. 
+ */ +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd, *pmd_sv; + pte_t *pte; + int i; + + pmd = pud_pgtable(*pud); + pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); + if (!pmd_sv) + return 0; + + /* Save each entry before clearing it so the PTE pages can still be found and freed after the flush below. */ for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_sv[i] = pmd[i]; + if (!pmd_none(pmd[i])) + pmd_clear(&pmd[i]); + } + + pud_clear(pud); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + + /* Free the PTE pages recorded in the saved copy. */ for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd_sv[i])) { + pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); + free_page((unsigned long)pte); + } + } + + free_page((unsigned long)pmd_sv); + + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); + free_page((unsigned long)pmd); + + return 1; +} + +/** + * pmd_free_pte_page - Clear pmd entry and free pte page. + * @pmd: Pointer to a PMD. + * @addr: Virtual address associated with pmd. + * + * Context: The pmd range has been unmapped and TLB purged. + * Return: Always 1 in this (CONFIG_X86_64) implementation. + */ +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + + pte = (pte_t *)pmd_page_vaddr(*pmd); + pmd_clear(pmd); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + + free_page((unsigned long)pte); + + return 1; +} + +#else /* !CONFIG_X86_64 */ + +/* + * Disable free page handling on x86-PAE. This assures that ioremap() + * does not update sync'd pmd entries. See vmalloc_sync_one(). 
+ */ +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + return pmd_none(*pmd); +} + +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + +/* Make @pte writable for @vma: VM_SHADOW_STACK VMAs go through pte_mkwrite_shstk(); all others use pte_mkwrite_novma() followed by pte_clear_saveddirty(). */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHADOW_STACK) + return pte_mkwrite_shstk(pte); + + pte = pte_mkwrite_novma(pte); + + return pte_clear_saveddirty(pte); +} + +/* PMD counterpart of pte_mkwrite(). */ pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHADOW_STACK) + return pmd_mkwrite_shstk(pmd); + + pmd = pmd_mkwrite_novma(pmd); + + return pmd_clear_saveddirty(pmd); +} + +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) +{ + /* + * Hardware before shadow stack can (rarely) set Dirty=1 + * on a Write=0 PTE. So the below condition + * only indicates a software bug when shadow stack is + * supported by the HW. This checking is covered in + * pte_shstk(). + */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pte_shstk(pte)); +} + +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) +{ + /* See note in arch_check_zapped_pte() */ + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && + pmd_shstk(pmd)); +} diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c new file mode 100644 index 0000000000..c234634e26 --- /dev/null +++ b/arch/x86/mm/pgtable_32.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/nmi.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> + +#include <asm/cpu_entry_area.h> +#include <asm/fixmap.h> +#include <asm/e820/api.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/io.h> +#include <linux/vmalloc.h> + +unsigned int __VMALLOC_RESERVE = 128 << 20; + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. 
+ */ +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Walk swapper_pg_dir; every intermediate level must already be populated, otherwise BUG(). */ pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* An empty pteval clears the mapping instead of installing one. */ if (!pte_none(pteval)) + set_pte_at(&init_mm, vaddr, pte, pteval); + else + pte_clear(&init_mm, vaddr, pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + flush_tlb_one_kernel(vaddr); +} + +unsigned long __FIXADDR_TOP = 0xfffff000; +EXPORT_SYMBOL(__FIXADDR_TOP); + +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + /* Add VMALLOC_OFFSET to the parsed value due to the vm area guard hole. */ + __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; + return 0; +} +early_param("vmalloc", parse_vmalloc); + +/* + * reservetop=size reserves a hole at the top of the kernel address space which + * a hypervisor can load into later. Needed for dynamically loaded hypervisors, + * so relocating the fixmap can be done before paging initialization. 
+ */ +static int __init parse_reservetop(char *arg) +{ + unsigned long address; + + if (!arg) + return -EINVAL; + + address = memparse(arg, &arg); + reserve_top_address(address); + early_ioremap_init(); + return 0; +} +early_param("reservetop", parse_reservetop); diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c new file mode 100644 index 0000000000..fc3f3d3e2e --- /dev/null +++ b/arch/x86/mm/physaddr.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/memblock.h> +#include <linux/mmdebug.h> +#include <linux/export.h> +#include <linux/mm.h> + +#include <asm/page.h> +#include <linux/vmalloc.h> + +#include "physaddr.h" + +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* + * x > y means the unsigned subtraction did not wrap, i.e. + * x >= __START_KERNEL_map; otherwise x was below __START_KERNEL_map. + */ + if (unlikely(x > y)) { + x = y + phys_base; + + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + } else { + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* x > y here means the addition did not wrap, i.e. the starting x was below PAGE_OFFSET */ + VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); + } + + return x; +} +EXPORT_SYMBOL(__phys_addr); + +unsigned long __phys_addr_symbol(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* only check the upper bound: an x below __START_KERNEL_map wraps y to a huge value that also fails this check */ + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + + return y + phys_base; +} +EXPORT_SYMBOL(__phys_addr_symbol); +#endif + +/* Performs the same range checks as the CONFIG_DEBUG_VIRTUAL __phys_addr() above, but reports a bad address by returning false; an address that passes must also satisfy pfn_valid(). */ bool __virt_addr_valid(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* x > y: the subtraction did not wrap, so x >= __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) + return false; + } else { + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* x > y here means the addition did not wrap, i.e. the starting x was below PAGE_OFFSET */ + if ((x > y) || !phys_addr_valid(x)) + return false; + } + + return pfn_valid(x >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#else 
+ +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + unsigned long phys_addr = x - PAGE_OFFSET; + /* VMALLOC_* aren't constants */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); + /* max_low_pfn is set early, but not _that_ early */ + if (max_low_pfn) { + VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); + BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); + } + return phys_addr; +} +EXPORT_SYMBOL(__phys_addr); +#endif + +/* 32-bit: @x is valid when it is at or above PAGE_OFFSET, is not a vmalloc address (once __vmalloc_start_set), lies below FIXADDR_START, and its pfn passes pfn_valid(). */ bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) + return false; + if (x >= FIXADDR_START) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h new file mode 100644 index 0000000000..9f6419cafc --- /dev/null +++ b/arch/x86/mm/physaddr.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/processor.h> + +/* With CONFIG_PHYS_ADDR_T_64BIT, @addr is valid when no bit at or above boot_cpu_data.x86_phys_bits is set; otherwise every address is accepted. */ static inline int phys_addr_valid(resource_size_t addr) +{ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + return !(addr >> boot_cpu_data.x86_phys_bits); +#else + return 1; +#endif +} diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c new file mode 100644 index 0000000000..7418c367e3 --- /dev/null +++ b/arch/x86/mm/pkeys.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Intel Memory Protection Keys management + * Copyright (c) 2015, Intel Corporation. + */ +#include <linux/debugfs.h> /* debugfs_create_file() */ +#include <linux/mm_types.h> /* mm_struct, vma, etc... */ +#include <linux/pkeys.h> /* PKEY_* */ +#include <uapi/asm-generic/mman-common.h> + +#include <asm/cpufeature.h> /* boot_cpu_has, ... 
*/ +#include <asm/mmu_context.h> /* vma_pkey() */ + +/* Return the pkey used for execute-only mappings in @mm, allocating one on first use and denying data access through it in PKRU. Returns -1 when no pkey can be allocated or PKRU cannot be updated. */ int __execute_only_pkey(struct mm_struct *mm) +{ + bool need_to_set_mm_pkey = false; + int execute_only_pkey = mm->context.execute_only_pkey; + int ret; + + /* Do we need to assign a pkey for mm's execute-only maps? */ + if (execute_only_pkey == -1) { + /* Go allocate one to use, which might fail */ + execute_only_pkey = mm_pkey_alloc(mm); + if (execute_only_pkey < 0) + return -1; + need_to_set_mm_pkey = true; + } + + /* + * We do not want to go through the relatively costly + * dance to set PKRU if we do not need to. Check it + * first and assume that if the execute-only pkey is + * already read-disabled that we do not have to set it + * ourselves. + */ + if (!need_to_set_mm_pkey && + !__pkru_allows_read(read_pkru(), execute_only_pkey)) { + return execute_only_pkey; + } + + /* + * Set up PKRU so that it denies access for everything + * other than execution. + */ + ret = arch_set_user_pkey_access(current, execute_only_pkey, + PKEY_DISABLE_ACCESS); + /* + * If the PKRU-set operation failed somehow, free the pkey, + * return -1 and effectively disable execute-only support. + */ + if (ret) { + mm_set_pkey_free(mm, execute_only_pkey); + return -1; + } + + /* We got one, store it and use it from here on out */ + if (need_to_set_mm_pkey) + mm->context.execute_only_pkey = execute_only_pkey; + return execute_only_pkey; +} + +/* True when @vma has VM_EXEC as its only access flag and uses the execute-only pkey of its mm. */ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & VM_ACCESS_FLAGS) != VM_EXEC) + return false; + if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey) + return false; + + return true; +} + +/* + * This is only called for *plain* mprotect calls. + * + * Returns the pkey that the mapping should use after the mprotect. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) +{ + /* + * Is this an mprotect_pkey() call? If so, never + * override the value that came from the user. 
+ */ + if (pkey != -1) + return pkey; + + /* + * The mapping is execute-only. Go try to get the + * execute-only protection key. If we fail to do that, + * fall through as if we do not have execute-only + * support in this mm. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } else if (vma_is_pkey_exec_only(vma)) { + /* + * Protections are *not* PROT_EXEC, but the mapping + * is using the exec-only pkey. This mapping was + * PROT_EXEC and will no longer be. Move back to + * the default pkey. + */ + return ARCH_DEFAULT_PKEY; + } + + /* + * This is a vanilla, non-pkey mprotect (or we failed to + * set up execute-only), inherit the pkey from the VMA we + * are working on. + */ + return vma_pkey(vma); +} + +#define PKRU_AD_MASK(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) + +/* + * Make the default PKRU value (at execve() time) as restrictive + * as possible. This ensures that any threads clone()'d early + * in the process's lifetime will not accidentally get access + * to data which is pkey-protected later on. 
+ */ +u32 init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) | + PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) | + PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) | + PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) | + PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) | + PKRU_AD_MASK(11) | PKRU_AD_MASK(12) | + PKRU_AD_MASK(13) | PKRU_AD_MASK(14) | + PKRU_AD_MASK(15); + +/* debugfs read handler: report init_pkru_value as a hexadecimal string followed by a newline. */ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "0x%x\n", init_pkru_value); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* debugfs write handler: parse a new init_pkru_value from at most sizeof(buf) - 1 bytes of user input. Returns @count on success, -EFAULT if the copy fails, and -EINVAL for unparsable input or a value with the pkey 0 AD or WD bit set. */ static ssize_t init_pkru_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + u32 new_init_pkru; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + /* Make the buffer a valid string that we can not overrun */ + buf[len] = '\0'; + if (kstrtouint(buf, 0, &new_init_pkru)) + return -EINVAL; + + /* + * Don't allow insane settings that will blow the system + * up immediately if someone attempts to disable access + * or writes to pkey 0. + */ + if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT)) + return -EINVAL; + + WRITE_ONCE(init_pkru_value, new_init_pkru); + return count; +} + +static const struct file_operations fops_init_pkru = { + .read = init_pkru_read_file, + .write = init_pkru_write_file, + .llseek = default_llseek, +}; + +static int __init create_init_pkru_value(void) +{ + /* Do not expose the file if pkeys are not supported. 
*/ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return 0; + + debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_init_pkru); + return 0; +} +late_initcall(create_init_pkru_value); + +/* Handler for the "init_pkru=" boot parameter: overrides init_pkru_value when @opt parses as an unsigned integer and silently ignores it otherwise; returns 1 in both cases. Unlike the debugfs write handler, the pkey 0 AD/WD bits are not rejected here. */ static __init int setup_init_pkru(char *opt) +{ + u32 new_init_pkru; + + if (kstrtouint(opt, 0, &new_init_pkru)) + return 1; + + WRITE_ONCE(init_pkru_value, new_init_pkru); + + return 1; +} +__setup("init_pkru=", setup_init_pkru); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 0000000000..78414c6d1b --- /dev/null +++ b/arch/x86/mm/pti.c @@ -0,0 +1,666 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This code is based in part on work published here: + * + * https://github.com/IAIK/KAISER + * + * The original work was written by and signed off for the Linux + * kernel by: + * + * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> + * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> + * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> + * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> + * + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and + * Andy Lutomirsky <luto@amacapital.net> + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/cpu.h> + +#include <asm/cpufeature.h> +#include <asm/hypervisor.h> +#include <asm/vsyscall.h> +#include <asm/cmdline.h> +#include <asm/pti.h> +#include <asm/tlbflush.h> +#include <asm/desc.h> +#include <asm/sections.h> +#include <asm/set_memory.h> + +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + +/* Backporting helper */ +#ifndef __GFP_NOTRACK 
+#define __GFP_NOTRACK 0 +#endif + +/* + * Define the page-table levels we clone for user-space on 32 + * and 64 bit. + */ +#ifdef CONFIG_X86_64 +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD +#else +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE +#endif + /* Log @reason only if the boot CPU has the Meltdown bug. */ +static void __init pti_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + pr_info("%s\n", reason); +} + /* Log @reason only if the boot CPU does not have the Meltdown bug. */ +static void __init pti_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + pr_info("%s\n", reason); +} + /* PTI policy chosen at boot by pti_check_boottime_disable(). */ +static enum pti_mode { + PTI_AUTO = 0, + PTI_FORCE_OFF, + PTI_FORCE_ON +} pti_mode; + /* + * Select the PTI mode at boot. The mode becomes PTI_FORCE_OFF (and + * X86_FEATURE_PTI is left unset) on Xen PV, for pti=off, for nopti and when + * cpu_mitigations_off() is true. pti=on forces the feature on. In every + * other case, including pti=auto, X86_FEATURE_PTI is forced only when the + * boot CPU has the Meltdown bug. + */ +void __init pti_check_boottime_disable(void) +{ + char arg[5]; + int ret; + + /* Assume mode is auto unless overridden. */ + pti_mode = PTI_AUTO; + + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_mode = PTI_FORCE_OFF; + pti_print_if_insecure("disabled on XEN PV."); + return; + } + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (ret == 3 && !strncmp(arg, "off", 3)) { + pti_mode = PTI_FORCE_OFF; + pti_print_if_insecure("disabled on command line."); + return; + } + if (ret == 2 && !strncmp(arg, "on", 2)) { + pti_mode = PTI_FORCE_ON; + pti_print_if_secure("force enabled on command line."); + goto enable; + } + if (ret == 4 && !strncmp(arg, "auto", 4)) { + pti_mode = PTI_AUTO; + goto autosel; + } + } + + if (cmdline_find_option_bool(boot_command_line, "nopti") || + cpu_mitigations_off()) { + pti_mode = PTI_FORCE_OFF; + pti_print_if_insecure("disabled on command line."); + return; + } + +autosel: + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return; +enable: + setup_force_cpu_cap(X86_FEATURE_PTI); +} + /* + * For a PGD slot that maps userspace, copy @pgd into the user page tables + * and return the value to be used for the kernel page tables. + */ +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Changes to the high (kernel) portion of the kernelmode page + * tables are not automatically propagated to the usermode tables.
+ * + * Users should keep in mind that, unlike the kernelmode tables, + * there is no vmalloc_fault equivalent for the usermode tables. + * Top-level entries added to init_mm's usermode pgd after boot + * will not be automatically propagated to other mms. + */ + /* Entries that do not map userspace are returned unchanged; nothing is mirrored. */ if (!pgdp_maps_userspace(pgdp)) + return pgd; + + /* + * The user page tables get the full PGD, accessible from + * userspace: + */ + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; + + /* + * If this is normal user memory, make it NX in the kernel + * pagetables so that, if we somehow screw up and return to + * usermode with the kernel CR3 loaded, we'll get a page fault + * instead of allowing user code to execute with the wrong CR3. + * + * As exceptions, we don't set NX if: + * - _PAGE_USER is not set. This could be an executable + * EFI runtime mapping or something similar, and the kernel + * may execute from it. + * - we don't have NX support + * - we're clearing the PGD (i.e. the new pgd is not present). + */ + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && + (__supported_pte_mask & _PAGE_NX)) + pgd.pgd |= _PAGE_NX; + + /* return the copy of the PGD we want the kernel to use: */ + return pgd; +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a P4D on success, or NULL on failure.
+ */ +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +{ + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (address < PAGE_OFFSET) { + WARN_ONCE(1, "attempt to walk user address\n"); + return NULL; + } + /* Allocate a zeroed P4D page if this user PGD slot is still empty. */ + if (pgd_none(*pgd)) { + unsigned long new_p4d_page = __get_free_page(gfp); + if (WARN_ON_ONCE(!new_p4d_page)) + return NULL; + + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + return p4d_offset(pgd, address); +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a PMD on success, or NULL on failure (the P4D walk + * failed, a large PUD mapping was found, or a page allocation failed). + */ +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + p4d_t *p4d; + pud_t *pud; + + p4d = pti_user_pagetable_walk_p4d(address); + if (!p4d) + return NULL; + + BUILD_BUG_ON(p4d_large(*p4d) != 0); + if (p4d_none(*p4d)) { + unsigned long new_pud_page = __get_free_page(gfp); + if (WARN_ON_ONCE(!new_pud_page)) + return NULL; + + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); + } + + pud = pud_offset(p4d, address); + /* The user page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (WARN_ON_ONCE(!new_pmd_page)) + return NULL; + + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + } + + return pmd_offset(pud, address); +} + +/* + * Walk the shadow copy of the page tables (optionally) trying to allocate + * page table pages on the way down. Does not support large pages. + * + * Note: this is only used when mapping *new* kernel data into the + * user/shadow page tables. It is never used for userspace data. + * + * Returns a pointer to a PTE on success, or NULL on failure.
+ */ +static pte_t *pti_user_pagetable_walk_pte(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + pmd_t *pmd; + pte_t *pte; + + pmd = pti_user_pagetable_walk_pmd(address); + if (!pmd) + return NULL; + + /* We can't do anything sensible if we hit a large mapping. */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + /* Unlike the P4D/PUD/PMD allocations, a failed allocation here does not WARN. */ + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + } + + pte = pte_offset_kernel(pmd, address); + if (pte_flags(*pte) & _PAGE_USER) { + WARN_ONCE(1, "attempt to walk to user pte\n"); + return NULL; + } + return pte; +} + /* + * Copy the kernel's 4K PTE for VSYSCALL_ADDR into the user page tables. + * An empty stub is used when CONFIG_X86_VSYSCALL_EMULATION is not set. + */ +#ifdef CONFIG_X86_VSYSCALL_EMULATION +static void __init pti_setup_vsyscall(void) +{ + pte_t *pte, *target_pte; + unsigned int level; + + pte = lookup_address(VSYSCALL_ADDR, &level); + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) + return; + + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); + if (WARN_ON(!target_pte)) + return; + + *target_pte = *pte; + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); +} +#else +static void __init pti_setup_vsyscall(void) { } +#endif + /* Page-table level at which pti_clone_pgtable() copies entries. */ +enum pti_clone_level { + PTI_CLONE_PMD, + PTI_CLONE_PTE, +}; + /* + * Copy the kernel mappings for [start, end) into the user page tables, + * either as whole PMDs (always the case for large PMDs) or PTE by PTE, + * depending on @level. + */ +static void +pti_clone_pgtable(unsigned long start, unsigned long end, + enum pti_clone_level level) +{ + unsigned long addr; + + /* + * Clone the populated PMDs which cover start to end. These PMD areas + * can have holes.
+ */ + for (addr = start; addr < end;) { + pte_t *pte, *target_pte; + pmd_t *pmd, *target_pmd; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + /* Overflow check: stop if addr has wrapped around below start */ + if (addr < start) + break; + + pgd = pgd_offset_k(addr); + if (WARN_ON(pgd_none(*pgd))) + return; + p4d = p4d_offset(pgd, addr); + if (WARN_ON(p4d_none(*p4d))) + return; + /* Empty PUD and PMD entries are holes: skip ahead to the next boundary. */ + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + WARN_ON_ONCE(addr & ~PUD_MASK); + addr = round_up(addr + 1, PUD_SIZE); + continue; + } + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + WARN_ON_ONCE(addr & ~PMD_MASK); + addr = round_up(addr + 1, PMD_SIZE); + continue; + } + + if (pmd_large(*pmd) || level == PTI_CLONE_PMD) { + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; + + /* + * Only clone present PMDs. This ensures only setting + * _PAGE_GLOBAL on present PMDs. This should only be + * called on well-known addresses anyway, so a non- + * present PMD would be a surprise. + */ + if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) + return; + + /* + * Setting 'target_pmd' below creates a mapping in both + * the user and kernel page tables. It is effectively + * global, so set it as global in both copies. Note: + * the X86_FEATURE_PGE check is not _required_ because + * the CPU ignores _PAGE_GLOBAL when PGE is not + * supported. The check keeps consistency with + * code that only sets this bit when supported. + */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); + + /* + * Copy the PMD.
That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = *pmd; + + addr += PMD_SIZE; + + } else if (level == PTI_CLONE_PTE) { + + /* Walk the page-table down to the pte level */ + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + addr += PAGE_SIZE; + continue; + } + + /* Only clone present PTEs */ + if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) + return; + + /* Allocate PTE in the user page-table */ + target_pte = pti_user_pagetable_walk_pte(addr); + if (WARN_ON(!target_pte)) + return; + + /* Set GLOBAL bit in both PTEs */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pte = pte_set_flags(*pte, _PAGE_GLOBAL); + + /* Clone the PTE */ + *target_pte = *pte; + + addr += PAGE_SIZE; + + } else { + /* level is neither PTI_CLONE_PMD nor PTI_CLONE_PTE */ BUG(); + } + } +} + +#ifdef CONFIG_X86_64 +/* + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a + * next-level entry on 5-level systems). Nothing is copied if the user P4D + * cannot be obtained. + */ +static void __init pti_clone_p4d(unsigned long addr) +{ + p4d_t *kernel_p4d, *user_p4d; + pgd_t *kernel_pgd; + + user_p4d = pti_user_pagetable_walk_p4d(addr); + if (!user_p4d) + return; + + kernel_pgd = pgd_offset_k(addr); + kernel_p4d = p4d_offset(kernel_pgd, addr); + *user_p4d = *kernel_p4d; +} + +/* + * Clone the CPU_ENTRY_AREA and associated data into the user space visible + * page table. + */ +static void __init pti_clone_user_shared(void) +{ + unsigned int cpu; + + pti_clone_p4d(CPU_ENTRY_AREA_BASE); + + for_each_possible_cpu(cpu) { + /* + * The SYSCALL64 entry code needs one word of scratch space + * in which to spill a register. It lives in the sp2 slot + * of the CPU's TSS. + * + * This is done for all possible CPUs during boot to ensure + * that it's propagated to all mms.
+ */ + + unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); + phys_addr_t pa = per_cpu_ptr_to_phys((void *)va); + pte_t *target_pte; + + target_pte = pti_user_pagetable_walk_pte(va); + if (WARN_ON(!target_pte)) + return; + /* Map the page backing this CPU's cpu_tss_rw at the same address, with PAGE_KERNEL. */ + *target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL); + } +} + +#else /* CONFIG_X86_64 */ + +/* + * On 32 bit PAE systems with 1GB of Kernel address space there is only + * one pgd/p4d for the whole kernel. Cloning that would map the whole + * address space into the user page-tables, making PTI useless. So clone + * the page-table on the PMD level to prevent that. + */ +static void __init pti_clone_user_shared(void) +{ + unsigned long start, end; + + start = CPU_ENTRY_AREA_BASE; + end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); + + pti_clone_pgtable(start, end, PTI_CLONE_PMD); +} +#endif /* CONFIG_X86_64 */ + +/* + * Clone the ESPFIX P4D into the user space visible page table. This is a + * no-op unless CONFIG_X86_ESPFIX64 is set. + */ +static void __init pti_setup_espfix64(void) +{ +#ifdef CONFIG_X86_ESPFIX64 + pti_clone_p4d(ESPFIX_BASE_ADDR); +#endif +} + +/* + * Clone the populated PMDs of the entry text and force it RO. + * + * NOTE(review): nothing in this function changes permissions; the read-only + * part presumably relies on how the kernel mapping is already set up when + * it is cloned - confirm. + */ +static void pti_clone_entry_text(void) +{ + pti_clone_pgtable((unsigned long) __entry_text_start, + (unsigned long) __entry_text_end, + PTI_CLONE_PMD); +} + +/* + * Global pages and PCIDs are both ways to make kernel TLB entries + * live longer, reduce TLB misses and improve kernel performance. + * But, leaving all kernel text Global makes it potentially accessible + * to Meltdown-style attacks which make it trivial to find gadgets or + * defeat KASLR. + * + * Only use global pages when it is really worth it. + */ +static inline bool pti_kernel_image_global_ok(void) +{ + /* + * Systems with PCIDs get little benefit from global + * kernel text and are not worth the downsides. + */ + if (cpu_feature_enabled(X86_FEATURE_PCID)) + return false; + + /* + * Only do global kernel image for pti=auto. Do the most + * secure thing (not global) if pti=on specified.
+ */ + if (pti_mode != PTI_AUTO) + return false; + + /* + * K8 may not tolerate the cleared _PAGE_RW on the userspace + * global kernel image pages. Do the safe thing (disable + * global kernel image). This is unlikely to ever be + * noticed because PTI is disabled by default on AMD CPUs. + */ + if (boot_cpu_has(X86_FEATURE_K8)) + return false; + + /* + * RANDSTRUCT derives its hardening benefits from the + * attacker's lack of knowledge about the layout of kernel + * data structures. Keep the kernel image non-global in + * cases where RANDSTRUCT is in use to help keep the layout a + * secret. + */ + if (IS_ENABLED(CONFIG_RANDSTRUCT)) + return false; + /* None of the exclusions above applied. */ + return true; +} + +/* + * For some configurations, map all of kernel text and rodata (the range + * from _text up to __end_rodata_aligned) into the user page tables. This + * reduces TLB misses, especially on non-PCID systems. + */ +static void pti_clone_kernel_text(void) +{ + /* + * rodata is part of the kernel image and is normally + * readable on the filesystem or on the web. But, do not + * clone the areas past rodata, they might contain secrets. + */ + unsigned long start = PFN_ALIGN(_text); + /* end of the range cloned into the user page tables below */ unsigned long end_clone = (unsigned long)__end_rodata_aligned; + /* end of the range handed to set_memory_global() below */ unsigned long end_global = PFN_ALIGN((unsigned long)_etext); + + if (!pti_kernel_image_global_ok()) + return; + + pr_debug("mapping partial kernel image into user address space\n"); + + /* + * Note that this will undo _some_ of the work that + * pti_set_kernel_image_nonglobal() did to clear the + * global bit. + */ + pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE); + + /* + * pti_clone_pgtable() will set the global bit in any PMDs + * that it clones, but we also need to get any PTEs in + * the last level for areas that are not huge-page-aligned.
+ */ + + /* Set the global bit for normal non-__init kernel text: */ + set_memory_global(start, (end_global - start) >> PAGE_SHIFT); +} + /* Clear _PAGE_GLOBAL on the kernel image mapping, from _text up to the PMD boundary after _end. */ +static void pti_set_kernel_image_nonglobal(void) +{ + /* + * The identity map is created with PMDs, regardless of the + * actual length of the kernel. We need to clear + * _PAGE_GLOBAL up to a PMD boundary, not just to the end + * of the image. + */ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE); + + /* + * This clears _PAGE_GLOBAL from the entire kernel image. + * pti_clone_kernel_text() may put _PAGE_GLOBAL back for + * areas that are mapped to userspace. + */ + set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); +} + +/* + * Initialize kernel page table isolation. Does nothing unless + * X86_FEATURE_PTI is set on the boot CPU. + */ +void __init pti_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("enabled\n"); + +#ifdef CONFIG_X86_32 + /* + * We check for X86_FEATURE_PCID here. But the init-code will + * clear the feature flag on 32 bit because the feature is not + * supported on 32 bit anyway. To print the warning we need to + * check with cpuid directly again. + */ + if (cpuid_ecx(0x1) & BIT(17)) { + /* Use printk to work around pr_fmt() */ + printk(KERN_WARNING "\n"); + printk(KERN_WARNING "************************************************************\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); + printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); + printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
**\n"); + printk(KERN_WARNING "************************************************************\n"); + } +#endif + + pti_clone_user_shared(); + + /* Undo all global bits from the init pagetables in head_64.S: */ + pti_set_kernel_image_nonglobal(); + /* Replace some of the global bits just for shared entry text: */ + pti_clone_entry_text(); + pti_setup_espfix64(); + pti_setup_vsyscall(); +} + +/* + * Finalize the kernel mappings in the userspace page-table. Some of the + * mappings for the kernel image might have changed since pti_init() + * cloned them. This is because parts of the kernel image have been + * mapped RO and/or NX. These changes need to be cloned again to the + * userspace page-table. + */ +void pti_finalize(void) +{ + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; + /* + * We need to clone everything (again) that maps parts of the + * kernel image. + */ + pti_clone_entry_text(); + pti_clone_kernel_text(); + + debug_checkwx_user(); +} diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c new file mode 100644 index 0000000000..9c52a95937 --- /dev/null +++ b/arch/x86/mm/srat.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. 
+ */ + +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/mmzone.h> +#include <linux/bitmap.h> +#include <linux/init.h> +#include <linux/topology.h> +#include <linux/mm.h> +#include <asm/proto.h> +#include <asm/numa.h> +#include <asm/e820/api.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> + +/* + * Callback for Proximity Domain -> x2APIC mapping: record the NUMA node for + * one enabled x2APIC SRAT entry. Entries that are too short and proximity + * domains that cannot be mapped to a node trigger bad_srat(); invalid or + * too-large APIC IDs are only logged and skipped. + */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + apic_id = pa->apic_id; + if (!apic_id_valid(apic_id)) { + pr_info("SRAT: PXM %u -> X2APIC 0x%04x ignored\n", pxm, apic_id); + return; + } + node = acpi_map_pxm_to_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", + pxm, apic_id, node); +} + +/* + * Callback for Proximity Domain -> LAPIC mapping. Note that the entry + * length must equal the structure size exactly here, whereas the x2APIC + * callback only rejects entries that are too short. + */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain_lo; + /* + * From SRAT revision 2 on, proximity_domain_hi supplies the bits above + * the low byte. NOTE(review): the value is loaded through an unsigned int + * pointer; check against the struct layout that this load stays within + * the entry - confirm. + */ if (acpi_srat_revision >= 2) + pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; + node = acpi_map_pxm_to_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (get_uv_system_type() >=
UV_X2APIC) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", + pxm, apic_id, node); +} + /* Run acpi_numa_init(); return its negative error, -EINVAL if srat_disabled() is then true, else 0. */ +int __init x86_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? -EINVAL : 0; +} diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c new file mode 100644 index 0000000000..bda73cb7a0 --- /dev/null +++ b/arch/x86/mm/testmmiotrace.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/io.h> +#include <linux/mmiotrace.h> +#include <linux/security.h> + +static unsigned long mmio_address; +module_param_hw(mmio_address, ulong, iomem, 0); +MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " + "(or 8 MB if read_far is non-zero)."); + +static unsigned long read_far = 0x400100; +module_param(read_far, ulong, 0); +MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " + "(default: 0x400100)."); + /* Test pattern used for the 16-bit accesses at offset i. */ +static unsigned v16(unsigned i) +{ + return i * 12 + 7; +} + /* Test pattern used for the 32-bit accesses at offset i. */ +static unsigned v32(unsigned i) +{ + return i * 212371 + 13; +} + /* + * Fill the mapping with test patterns: bytes at offsets 0..255, 16-bit + * values from 1 kB up to 5 kB and 32-bit values from 5 kB up to 16 kB. + * Offsets 256..1023 are not written. + */ +static void do_write_test(void __iomem *p) +{ + unsigned int i; + pr_info("write test.\n"); + mmiotrace_printk("Write test.\n"); + + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(v16(i), p + i); + + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(v32(i), p + i); +} + /* Read back the ranges written by do_write_test() and count mismatches per access width. */ +static void do_read_test(void __iomem *p) +{ + unsigned int i; + unsigned errs[3] = { 0 }; + pr_info("read test.\n"); +
mmiotrace_printk("Read test.\n"); + + for (i = 0; i < 256; i++) + if (ioread8(p + i) != i) + ++errs[0]; + + for (i = 1024; i < (5 * 1024); i += 2) + if (ioread16(p + i) != v16(i)) + ++errs[1]; + + for (i = (5 * 1024); i < (16 * 1024); i += 4) + if (ioread32(p + i) != v32(i)) + ++errs[2]; + /* NOTE(review): errs[] is unsigned but printed with %d; harmless for counts this small. */ + mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", + errs[0], errs[1], errs[2]); +} + /* Issue a single 32-bit read at offset read_far; the value read is discarded. */ +static void do_read_far_test(void __iomem *p) +{ + pr_info("read far test.\n"); + mmiotrace_printk("Read far test.\n"); + + ioread32(p + read_far); +} + /* + * Map @size bytes at mmio_address, run the write and read tests, and do the + * far read only when read_far is non-zero and below size - 4. The mapping + * is released before returning. + */ +static void do_test(unsigned long size) +{ + void __iomem *p = ioremap(mmio_address, size); + if (!p) { + pr_err("could not ioremap, aborting.\n"); + return; + } + mmiotrace_printk("ioremap returned %p.\n", p); + do_write_test(p); + do_read_test(p); + if (read_far && read_far < size - 4) + do_read_far_test(p); + iounmap(p); +} + +/* + * Tests how mmiotrace behaves in the face of multiple ioremap / iounmaps in + * a short time. We had a bug in the deferred freeing procedure which tried + * to free this region multiple times (ioremap can reuse the same address + * for many mappings). + */ +static void do_test_bulk_ioremapping(void) +{ + void __iomem *p; + int i; + + for (i = 0; i < 10; ++i) { + p = ioremap(mmio_address, PAGE_SIZE); + if (p) + iounmap(p); + } + + /* Force freeing. If it will crash we will know why. */ + synchronize_rcu(); +} + /* + * Module init: bail out if security_locked_down() returns an error or if + * mmio_address was not given; otherwise run both tests once. + */ +static int __init init(void) +{ + unsigned long size = (read_far) ?
(8 << 20) : (16 << 10); + int ret = security_locked_down(LOCKDOWN_MMIOTRACE); + + if (ret) + return ret; + /* mmio_address defaults to 0, which is rejected: the user must supply it. */ + if (mmio_address == 0) { + pr_err("you have to use the module argument mmio_address.\n"); + pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + pr_warn("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); + do_test(size); + do_test_bulk_ioremapping(); + pr_info("All done.\n"); + return 0; +} + /* Module exit: only logs; do_test() and do_test_bulk_ioremapping() unmap what they map. */ +static void __exit cleanup(void) +{ + pr_debug("unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c new file mode 100644 index 0000000000..453ea95b66 --- /dev/null +++ b/arch/x86/mm/tlb.c @@ -0,0 +1,1353 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/export.h> +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/sched/smt.h> +#include <linux/task_work.h> +#include <linux/mmu_notifier.h> + +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/nospec-branch.h> +#include <asm/cache.h> +#include <asm/cacheflush.h> +#include <asm/apic.h> +#include <asm/perf_event.h> + +#include "mm_internal.h" + /* Without CONFIG_PARAVIRT the __flush_tlb_*() names map straight to the native_*() implementations. */ +#ifdef CONFIG_PARAVIRT +# define STATIC_NOPV +#else +# define STATIC_NOPV static +# define __flush_tlb_local native_flush_tlb_local +# define __flush_tlb_global native_flush_tlb_global +# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) +# define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info) +#endif + +/* + * TLB flushing, formerly SMP-only + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (It's not allowed anyway).
+ * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + * + * More scalable flush, from Andi Kleen + * + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi + */ + +/* + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is + * stored in cpu_tlbstate.last_user_mm_spec. + */ +#define LAST_USER_MM_IBPB 0x1UL +#define LAST_USER_MM_L1D_FLUSH 0x2UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH) + +/* Bits to set when tlbstate and flush are (re)initialized */ +#define LAST_USER_MM_INIT LAST_USER_MM_IBPB + +/* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048.
+ * + */ + +/* + * There are 12 bits of space for ASIDs in CR3. + * NOTE(review): CR3_HW_ASID_BITS is not referenced in the visible part of + * this file; CR3_AVAIL_PCID_BITS below is derived from X86_CR3_PCID_BITS. + */ +#define CR3_HW_ASID_BITS 12 + +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_ASID_AVAILABLE are valid. -1 below to account + * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * Given @asid, compute kPCID (asid + 1) + */ +static inline u16 kern_pcid(u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not conflict with the + * bit we are using to switch between user and kernel ASIDs. + */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); + + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, + * so do not bother to clear it. + * + * If PCID is on, ASID-aware code paths put the ASID+1 into the + * PCID bits. This serves two purposes. It prevents a nasty + * situation in which PCID-unaware code saves CR3, loads some other + * value (with PCID == 0), and then restores CR3, thus corrupting + * the TLB for ASID 0 if the saved ASID was nonzero. It also means + * that any bugs involving loading a PCID-enabled CR3 with + * CR4.PCIDE off will trigger deterministically.
+ */ + return asid + 1; +} + +/* + * Given @asid, compute uPCID: the kPCID with the PTI user bit set when + * CONFIG_PAGE_TABLE_ISOLATION is enabled, otherwise the same as the kPCID. + */ +static inline u16 user_pcid(u16 asid) +{ + u16 ret = kern_pcid(asid); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; +#endif + return ret; +} + /* + * Compose a CR3 value from the physical address of @pgd (via __sme_pa()), + * the @lam bits and, when the CPU has PCID, the kernel PCID for @asid. + * Without PCID, @asid is expected to be 0. + */ +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam) +{ + unsigned long cr3 = __sme_pa(pgd) | lam; + + if (static_cpu_has(X86_FEATURE_PCID)) { + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + cr3 |= kern_pcid(asid); + } else { + VM_WARN_ON_ONCE(asid != 0); + } + + return cr3; +} + /* Same as build_cr3() but with CR3_NOFLUSH set; warns if the boot CPU lacks PCID. */ +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid, + unsigned long lam) +{ + /* + * Use boot_cpu_has() instead of this_cpu_has() as this function + * might be called during early boot. This should work even after + * boot because all CPUs have the same capabilities: + */ + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); + return build_cr3(pgd, asid, lam) | CR3_NOFLUSH; +} + +/* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts. We do the + * necessary invalidation by clearing out the 'ctx_id' which + * forces a TLB flush when the context is loaded. + */ +static void clear_asid_other(void) +{ + u16 asid; + + /* + * This is only expected to be set if we have disabled + * kernel _PAGE_GLOBAL pages.
+ */ + if (!static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON_ONCE(1); + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + /* Do not need to flush the current asid */ + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) + continue; + /* + * Make sure the next time we go to switch to + * this asid, we do a flush: + */ + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); + } + this_cpu_write(cpu_tlbstate.invalidate_other, false); +} + +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + + /* + * Pick the ASID to use for @next on this CPU and report through @need_flush + * whether its TLB contents must be flushed. Without PCID the answer is + * always ASID 0 with a flush. Otherwise a slot whose ctx_id matches @next is + * reused, flushing only if the slot's tlb_gen is older than @next_tlb_gen; + * if there is no such slot, the one indicated by next_asid is taken (wrapping + * back to slot 0) and a flush is requested. + */ +static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) +{ + u16 asid; + + if (!static_cpu_has(X86_FEATURE_PCID)) { + *new_asid = 0; + *need_flush = true; + return; + } + + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) + continue; + + *new_asid = asid; + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < + next_tlb_gen); + return; + } + + /* + * We don't currently own an ASID slot on this CPU. + * Allocate a slot. + */ + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + /* Wrap around to slot 0 once the counter has run past the last slot. */ if (*new_asid >= TLB_NR_DYN_ASIDS) { + *new_asid = 0; + this_cpu_write(cpu_tlbstate.next_asid, 1); + } + *need_flush = true; +} + +/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. + */ +static inline void invalidate_user_asid(u16 asid) +{ + /* There is no user ASID if address space separation is off */ + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + return; + + /* + * We only have a single ASID if PCID is off and the CR3 + * write will have flushed it.
+ */ + if (!cpu_feature_enabled(X86_FEATURE_PCID)) + return; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + /* Record the deferred flush: set this ASID's kPCID bit in the per-CPU user_pcid_flush_mask. */ + __set_bit(kern_pcid(asid), + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); +} + /* + * Write CR3 for the new mm. With @need_flush the user ASID is marked for + * invalidation and the CR3 value is built without CR3_NOFLUSH; otherwise + * the CR3_NOFLUSH variant is used. + */ +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam, + bool need_flush) +{ + unsigned long new_mm_cr3; + + if (need_flush) { + invalidate_user_asid(new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid, lam); + } else { + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam); + } + + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + /* + * Switch this CPU over to init_mm unless init_mm is already loaded. + * NOTE(review): @cpu is not used in the body. + */ +void leave_mm(int cpu) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + + /* + * It's plausible that we're in lazy TLB mode while our mm is init_mm. + * If so, our callers still expect us to flush the TLB, but there + * aren't any user TLB entries in init_mm to worry about. + * + * This needs to happen before any other sanity checks due to + * intel_idle's shenanigans. + */ + if (loaded_mm == &init_mm) + return; + + /* Warn if we're not lazy. */ + WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy)); + + switch_mm(NULL, &init_mm, NULL); +} +EXPORT_SYMBOL_GPL(leave_mm); + /* Wrapper that runs switch_mm_irqs_off() with local interrupts disabled. */ +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +/* + * Invoked from return to user/guest by a task that opted-in to L1D + * flushing but ended up running on an SMT enabled core due to wrong + * affinity settings or CPU hotplug. This is part of the paranoid L1D flush + * contract which this task requested.
+ */
+static void l1d_flush_force_sigbus(struct callback_head *ch)
+{
+	force_sig(SIGBUS);
+}
+/*
+ * Handle the opt-in L1D flush for a switch between user mm's.
+ *
+ * @prev_mm and @next_mm are mm pointer values with the speculation control
+ * bits mangled into the low bits (see mm_mangle_tif_spec_bits()); @prev_mm
+ * is the value cond_mitigation() read from cpu_tlbstate.last_user_mm_spec.
+ * @next is the incoming task.
+ */
+static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
+				struct task_struct *next)
+{
+	/* Flush L1D if the outgoing task requests it */
+	if (prev_mm & LAST_USER_MM_L1D_FLUSH)
+		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+
+	/* Check whether the incoming task opted in for L1D flush */
+	if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH)))
+		return;
+
+	/*
+	 * Validate that it is not running on an SMT sibling as this would
+	 * make the exercise pointless because the siblings share L1D. If
+	 * it runs on a SMT sibling, notify it with SIGBUS on return to
+	 * user/guest. The opt-in flag is cleared and the SIGBUS is queued
+	 * as task work on the incoming task.
+	 */
+	if (this_cpu_read(cpu_info.smt_active)) {
+		clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH);
+		next->l1d_flush_kill.func = l1d_flush_force_sigbus;
+		task_work_add(next, &next->l1d_flush_kill, TWA_RESUME);
+	}
+}
+/*
+ * Build the value that cond_mitigation() stores in
+ * cpu_tlbstate.last_user_mm_spec for @next: the task's mm pointer OR-ed
+ * with its thread flags shifted right by TIF_SPEC_IB and masked with
+ * LAST_USER_MM_SPEC_MASK.
+ */
+static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
+{
+	unsigned long next_tif = read_task_thread_flags(next);
+	unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
+
+	/*
+	 * Ensure that the bit shift above works as expected and the two flags
+	 * end up in bit 0 and 1.
+	 */
+	BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1);
+
+	return (unsigned long)next->mm | spec_bits;
+}
+/*
+ * Apply the mitigations (IBPB, L1D flush) that depend on which user mm ran
+ * last on this CPU, then record the mangled mm of @next in
+ * cpu_tlbstate.last_user_mm_spec.
+ *
+ * Returns without touching any state when @next is NULL or has no mm.
+ */
+static void cond_mitigation(struct task_struct *next)
+{
+	unsigned long prev_mm, next_mm;
+	/* Nothing to mitigate without an incoming task that has an mm. */
+	if (!next || !next->mm)
+		return;
+
+	next_mm = mm_mangle_tif_spec_bits(next);
+	prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
+
+	/*
+	 * Avoid user/user BTB poisoning by flushing the branch predictor
+	 * when switching between processes. This stops one process from
+	 * doing Spectre-v2 attacks on another.
+	 *
+	 * Both, the conditional and the always IBPB mode use the mm
+	 * pointer to avoid the IBPB when switching between tasks of the
+	 * same process. Using the mm pointer instead of mm->context.ctx_id
+	 * opens a hypothetical hole vs. mm_struct reuse, which is more or
+	 * less impossible to control by an attacker. Aside of that it
+	 * would only affect the first schedule so the theoretically
+	 * exposed data is not really interesting.
+	 */
+	if (static_branch_likely(&switch_mm_cond_ibpb)) {
+		/*
+		 * This is a bit more complex than the always mode because
+		 * it has to handle two cases:
+		 *
+		 * 1) Switch from a user space task (potential attacker)
+		 *    which has TIF_SPEC_IB set to a user space task
+		 *    (potential victim) which has TIF_SPEC_IB not set.
+		 *
+		 * 2) Switch from a user space task (potential attacker)
+		 *    which has TIF_SPEC_IB not set to a user space task
+		 *    (potential victim) which has TIF_SPEC_IB set.
+		 *
+		 * This could be done by unconditionally issuing IBPB when
+		 * a task which has TIF_SPEC_IB set is either scheduled in
+		 * or out. Though that results in two flushes when:
+		 *
+		 * - the same user space task is scheduled out and later
+		 *   scheduled in again and only a kernel thread ran in
+		 *   between.
+		 *
+		 * - a user space task belonging to the same process is
+		 *   scheduled in after a kernel thread ran in between
+		 *
+		 * - a user space task belonging to the same process is
+		 *   scheduled in immediately.
+		 *
+		 * Optimize this with reasonably small overhead for the
+		 * above cases. Mangle the TIF_SPEC_IB bit into the mm
+		 * pointer of the incoming task which is stored in
+		 * cpu_tlbstate.last_user_mm_spec for comparison.
+		 *
+		 * Issue IBPB only if the mm's are different and one or
+		 * both have the IBPB bit set.
+		 */
+		if (next_mm != prev_mm &&
+		    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+			indirect_branch_prediction_barrier();
+	}
+
+	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+		/*
+		 * Only flush when switching to a user space task with a
+		 * different context than the user space task which ran
+		 * last on this CPU.
+		 */
+		if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) !=
+					(unsigned long)next->mm)
+			indirect_branch_prediction_barrier();
+	}
+
+	if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) {
+		/*
+		 * Flush L1D when the outgoing task requested it and/or
+		 * check whether the incoming task requested L1D flushing
+		 * and ended up on an SMT sibling.
+		 */
+		if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH))
+			l1d_flush_evaluate(prev_mm, next_mm, next);
+	}
+
+	this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
+}
+/*
+ * cr4_update_pce_mm() sets CR4.PCE when rdpmc_always_available_key is
+ * enabled, or when rdpmc_never_available_key is not enabled and @mm has a
+ * non-zero context.perf_rdpmc_allowed count; otherwise it clears CR4.PCE.
+ * Without CONFIG_PERF_EVENTS it is an empty stub.
+ */
+#ifdef CONFIG_PERF_EVENTS
+static inline void cr4_update_pce_mm(struct mm_struct *mm)
+{
+	if (static_branch_unlikely(&rdpmc_always_available_key) ||
+	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
+	     atomic_read(&mm->context.perf_rdpmc_allowed))) {
+		/*
+		 * Clear the existing dirty counters to
+		 * prevent the leak for an RDPMC task.
+		 */
+		perf_clear_dirty_counters();
+		cr4_set_bits_irqsoff(X86_CR4_PCE);
+	} else
+		cr4_clear_bits_irqsoff(X86_CR4_PCE);
+}
+/*
+ * Re-evaluate CR4.PCE for the mm currently loaded on this CPU
+ * (cpu_tlbstate.loaded_mm). The argument is not used.
+ */
+void cr4_update_pce(void *ignored)
+{
+	cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
+}
+
+#else
+static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
+#endif
+/*
+ * Switch this CPU to the address space of @next.
+ *
+ * @prev is not consulted: the mm that is actually loaded is read from
+ * cpu_tlbstate.loaded_mm. @tsk is the incoming task and is only handed to
+ * cond_mitigation(); it may be NULL. Interrupts are expected to be
+ * disabled (asserted under CONFIG_PROVE_LOCKING).
+ *
+ * On the switching path, cpu_tlbstate.loaded_mm is set to
+ * LOADED_MM_SWITCHING before CR3 is written and to @next afterwards.
+ */
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+			struct task_struct *tsk)
+{
+	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	unsigned long new_lam = mm_lam_cr3_mask(next);
+	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
+	unsigned cpu = smp_processor_id();
+	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
+
+	/*
+	 * NB: The scheduler will call us with prev == next when switching
+	 * from lazy TLB mode to normal mode if active_mm isn't changing.
+	 * When this happens, we don't assume that CR3 (and hence
+	 * cpu_tlbstate.loaded_mm) matches next.
+	 *
+	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
+	 */
+
+	/* We don't want flush_tlb_func() to run concurrently with us. */
+	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+		WARN_ON_ONCE(!irqs_disabled());
+
+	/*
+	 * Verify that CR3 is what we think it is. This will catch
+	 * hypothetical buggy code that directly switches to swapper_pg_dir
+	 * without going through leave_mm() / switch_mm_irqs_off() or that
+	 * does something like write_cr3(read_cr3_pa()).
+	 *
+	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
+	 * isn't free.
+	 */
+#ifdef CONFIG_DEBUG_VM
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+						   tlbstate_lam_cr3_mask()))) {
+		/*
+		 * If we were to BUG here, we'd be very likely to kill
+		 * the system so hard that we don't see the call trace.
+		 * Try to recover instead by ignoring the error and doing
+		 * a global flush to minimize the chance of corruption.
+		 *
+		 * (This is far from being a fully correct recovery.
+		 *  Architecturally, the CPU could prefetch something
+		 *  back into an incorrect ASID slot and leave it there
+		 *  to cause trouble down the road. It's better than
+		 *  nothing, though.)
+		 */
+		__flush_tlb_all();
+	}
+#endif
+	if (was_lazy)
+		this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
+
+	/*
+	 * The membarrier system call requires a full memory barrier and
+	 * core serialization before returning to user-space, after
+	 * storing to rq->curr, when changing mm. This is because
+	 * membarrier() sends IPIs to all CPUs that are in the target mm
+	 * to make them issue memory barriers. However, if another CPU
+	 * switches to/from the target mm concurrently with
+	 * membarrier(), it can cause that CPU not to receive an IPI
+	 * when it really should issue a memory barrier. Writing to CR3
+	 * provides that full memory barrier and core serializing
+	 * instruction.
+	 */
+	if (real_prev == next) {
+		/* Not actually switching mm's */
+		VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+			   next->context.ctx_id);
+
+		/*
+		 * If this races with another thread that enables lam, 'new_lam'
+		 * might not match tlbstate_lam_cr3_mask().
+		 */
+
+		/*
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
+		 */
+		if (WARN_ON_ONCE(real_prev != &init_mm &&
+				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
+			cpumask_set_cpu(cpu, mm_cpumask(next));
+
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
+	} else {
+		/*
+		 * Apply process to process speculation vulnerability
+		 * mitigations if applicable.
+		 */
+		cond_mitigation(tsk);
+
+		/*
+		 * Stop remote flushes for the previous mm.
+		 * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+		 * but the bitmap manipulation can cause cache line contention.
+		 */
+		if (real_prev != &init_mm) {
+			VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+						mm_cpumask(real_prev)));
+			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+		}
+
+		/*
+		 * Start remote flushes and then read tlb_gen.
+		 */
+		if (next != &init_mm)
+			cpumask_set_cpu(cpu, mm_cpumask(next));
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+
+		/* Let nmi_uaccess_okay() know that we're changing CR3. */
+		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+		barrier();
+	}
+
+	set_tlbstate_lam_mode(next);
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
+
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
+
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+	}
+
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
+	if (next != real_prev) {
+		cr4_update_pce_mm(next);
+		switch_ldt(real_prev, next);
+	}
+}
+
+/*
+ * Please ignore the name of this function. It should be called
+ * switch_to_kernel_thread().
+ *
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm. Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row. It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ *
+ * This implementation only sets cpu_tlbstate_shared.is_lazy, and does
+ * nothing at all when init_mm is already loaded. Neither @mm nor @tsk is
+ * used.
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+		return;
+
+	this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
+}
+
+/*
+ * Call this when reinitializing a CPU. It fixes the following potential
+ * problems:
+ *
+ * - The ASID changed from what cpu_tlbstate thinks it is (most likely
+ *   because the CPU was taken down and came back up with CR3's PCID
+ *   bits clear. CPU hotplug can do this.
+ *
+ * - The TLB contains junk in slots corresponding to inactive ASIDs.
+ * + * - The CPU went so far out to lunch that it may have missed a TLB + * flush. + */ +void initialize_tlbstate_and_flush(void) +{ + int i; + struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long cr3 = __read_cr3(); + + /* Assert that CR3 already references the right mm. */ + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + + /* LAM expected to be disabled */ + WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57)); + WARN_ON(mm_lam_cr3_mask(mm)); + + /* + * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization + * doesn't work like other CR4 bits because it can only be set from + * long mode.) + */ + WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && + !(cr4_read_shadow() & X86_CR4_PCIDE)); + + /* Disable LAM, force ASID 0 and force a TLB flush. */ + write_cr3(build_cr3(mm->pgd, 0, 0)); + + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + set_tlbstate_lam_mode(mm); + + for (i = 1; i < TLB_NR_DYN_ASIDS; i++) + this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); +} + +/* + * flush_tlb_func()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ +static void flush_tlb_func(void *info) +{ + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. 
+ */ + const struct flush_tlb_info *f = info; + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; + + /* This code cannot presently handle being reentered. */ + VM_WARN_ON(!irqs_disabled()); + + if (!local) { + inc_irq_stat(irq_tlb_count); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + + /* Can only happen on remote CPUs */ + if (f->mm && f->mm != loaded_mm) + return; + } + + if (unlikely(loaded_mm == &init_mm)) + return; + + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) { + /* + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. + * + * This should be rare, with native_flush_tlb_multi() skipping + * IPIs to lazy TLB mode CPUs. + */ + switch_mm_irqs_off(NULL, &init_mm, NULL); + return; + } + + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* + * The TLB is already up to date in respect to f->new_tlb_gen. + * While the core might be still behind mm_tlb_gen, checking + * mm_tlb_gen unnecessarily would have negative caching effects + * so avoid it. + */ + return; + } + + /* + * Defer mm_tlb_gen reading as long as possible to avoid cache + * contention. + */ + mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. 
This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + goto done; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_one_user() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. 
By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ + unsigned long addr = f->start; + + /* Partial flush cannot have invalid generations */ + VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID); + + /* Partial flush must have valid mm */ + VM_WARN_ON(f->mm == NULL); + + nr_invalidate = (f->end - f->start) >> f->stride_shift; + + while (addr < f->end) { + flush_tlb_one_user(addr); + addr += 1UL << f->stride_shift; + } + if (local) + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); + } else { + /* Full flush. */ + nr_invalidate = TLB_FLUSH_ALL; + + flush_tlb_local(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + } + + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); + + /* Tracing is done in a unified manner to reduce the code size */ +done: + trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN : + (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN : + TLB_LOCAL_MM_SHOOTDOWN, + nr_invalidate); +} + +static bool tlb_is_not_lazy(int cpu, void *data) +{ + return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu); +} + +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared); + +STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + /* + * Do accounting and tracing. Note that there are (and have always been) + * cases in which a remote TLB flush will be traced, but eventually + * would not happen. 
+ */ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + if (info->end == TLB_FLUSH_ALL) + trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); + else + trace_tlb_flush(TLB_REMOTE_SEND_IPI, + (info->end - info->start) >> PAGE_SHIFT); + + /* + * If no page tables were freed, we can skip sending IPIs to + * CPUs in lazy TLB mode. They will flush the CPU themselves + * at the next context switch. + * + * However, if page tables are getting freed, we need to send the + * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ + if (info->freed_tables) + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, + (void *)info, 1, cpumask); +} + +void flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + __flush_tlb_multi(cpumask, info); +} + +/* + * See Documentation/arch/x86/tlb.rst for details. We choose 33 + * because it is large enough to cover the vast majority (at + * least 95%) of allocations, and is small enough that we are + * confident it will not cause too much overhead. Each single + * flush is about 100 ns, so this caps the maximum overhead at + * _about_ 3,000 ns. + * + * This is in units of pages. + */ +unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); +#endif + +static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned int stride_shift, bool freed_tables, + u64 new_tlb_gen) +{ + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM + /* + * Ensure that the following code is non-reentrant and flush_tlb_info + * is not overwritten. 
This means no TLB flushing is initiated by + * interrupt handlers and machine-check exception handlers. + */ + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); +#endif + + info->start = start; + info->end = end; + info->mm = mm; + info->stride_shift = stride_shift; + info->freed_tables = freed_tables; + info->new_tlb_gen = new_tlb_gen; + info->initiating_cpu = smp_processor_id(); + + return info; +} + +static void put_flush_tlb_info(void) +{ +#ifdef CONFIG_DEBUG_VM + /* Complete reentrancy prevention checks */ + barrier(); + this_cpu_dec(flush_tlb_info_idx); +#endif +} + +void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned int stride_shift, + bool freed_tables) +{ + struct flush_tlb_info *info; + u64 new_tlb_gen; + int cpu; + + cpu = get_cpu(); + + /* Should we flush just the requested range? */ + if ((end == TLB_FLUSH_ALL) || + ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { + start = 0; + end = TLB_FLUSH_ALL; + } + + /* This is also a barrier that synchronizes with switch_mm(). */ + new_tlb_gen = inc_mm_tlb_gen(mm); + + info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, + new_tlb_gen); + + /* + * flush_tlb_multi() is not optimized for the common case in which only + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
+ */ + if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + flush_tlb_multi(mm_cpumask(mm), info); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + lockdep_assert_irqs_enabled(); + local_irq_disable(); + flush_tlb_func(info); + local_irq_enable(); + } + + put_flush_tlb_info(); + put_cpu(); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); +} + + +static void do_flush_tlb_all(void *info) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + __flush_tlb_all(); +} + +void flush_tlb_all(void) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + on_each_cpu(do_flush_tlb_all, NULL, 1); +} + +static void do_kernel_range_flush(void *info) +{ + struct flush_tlb_info *f = info; + unsigned long addr; + + /* flush range by one by one 'invlpg' */ + for (addr = f->start; addr < f->end; addr += PAGE_SIZE) + flush_tlb_one_kernel(addr); +} + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + /* Balance as user space task's flush, a bit conservative */ + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { + on_each_cpu(do_flush_tlb_all, NULL, 1); + } else { + struct flush_tlb_info *info; + + preempt_disable(); + info = get_flush_tlb_info(NULL, start, end, 0, false, + TLB_GENERATION_INVALID); + + on_each_cpu(do_kernel_range_flush, info, 1); + + put_flush_tlb_info(); + preempt_enable(); + } +} + +/* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. + */ +unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = + build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid), + tlbstate_lam_cr3_mask()); + + /* For now, be very restrictive about when this can be called. 
*/ + VM_WARN_ON(in_nmi() || preemptible()); + + VM_BUG_ON(cr3 != __read_cr3()); + return cr3; +} +EXPORT_SYMBOL_GPL(__get_current_cr3_fast); + +/* + * Flush one page in the kernel mapping + */ +void flush_tlb_one_kernel(unsigned long addr) +{ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + + /* + * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its + * paravirt equivalent. Even with PCID, this is sufficient: we only + * use PCID if we also use global PTEs for the kernel mapping, and + * INVLPG flushes global translations across all address spaces. + * + * If PTI is on, then the kernel is mapped with non-global PTEs, and + * __flush_tlb_one_user() will flush the given address for the current + * kernel address space and for its usermode counterpart, but it does + * not flush it for other address spaces. + */ + flush_tlb_one_user(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * See above. We need to propagate the flush to all other address + * spaces. In principle, we only need to propagate it to kernelmode + * address spaces, but the extra bookkeeping we would need is not + * worth it. + */ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + +/* + * Flush one page in the user mapping + */ +STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr) +{ + u32 loaded_mm_asid; + bool cpu_pcide; + + /* Flush 'addr' from the kernel PCID: */ + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + /* If PTI is off there is no user PCID and nothing to flush. */ + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + cpu_pcide = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE; + + /* + * invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check + * 'cpu_pcide' to ensure that *this* CPU will not trigger those + * #GP's even if called before CR4.PCIDE has been initialized. 
+ */ + if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide) + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); + else + invalidate_user_asid(loaded_mm_asid); +} + +void flush_tlb_one_user(unsigned long addr) +{ + __flush_tlb_one_user(addr); +} + +/* + * Flush everything + */ +STATIC_NOPV void native_flush_tlb_global(void) +{ + unsigned long flags; + + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; + } + + /* + * Read-modify-write to CR4 - protect it from preemption and + * from interrupts. (Use the raw variant because this code can + * be called from deep inside debugging code.) + */ + raw_local_irq_save(flags); + + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + + raw_local_irq_restore(flags); +} + +/* + * Flush the entire current user mapping + */ +STATIC_NOPV void native_flush_tlb_local(void) +{ + /* + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). + */ + WARN_ON_ONCE(preemptible()); + + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); +} + +void flush_tlb_local(void) +{ + __flush_tlb_local(); +} + +/* + * Flush everything + */ +void __flush_tlb_all(void) +{ + /* + * This is to catch users with enabled preemption and the PGE feature + * and don't trigger the warning in __native_flush_tlb(). + */ + VM_WARN_ON_ONCE(preemptible()); + + if (cpu_feature_enabled(X86_FEATURE_PGE)) { + __flush_tlb_global(); + } else { + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. 
+ */ + flush_tlb_local(); + } +} +EXPORT_SYMBOL_GPL(__flush_tlb_all); + +void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + struct flush_tlb_info *info; + + int cpu = get_cpu(); + + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); + /* + * flush_tlb_multi() is not optimized for the common case in which only + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. + */ + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + flush_tlb_multi(&batch->cpumask, info); + } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { + lockdep_assert_irqs_enabled(); + local_irq_disable(); + flush_tlb_func(info); + local_irq_enable(); + } + + cpumask_clear(&batch->cpumask); + + put_flush_tlb_info(); + put_cpu(); +} + +/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. + */ +bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + + VM_WARN_ON_ONCE(!loaded_mm); + + /* + * The condition we want to check is + * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() + * is supposed to be reasonably fast. + * + * Instead, we check the almost equivalent but somewhat conservative + * condition below, and we rely on the fact that switch_mm_irqs_off() + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. 
+ */ + if (loaded_mm != current_mm) + return false; + + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + + return true; +} + +static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t tlbflush_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + int ceiling; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + buf[len] = '\0'; + if (kstrtoint(buf, 0, &ceiling)) + return -EINVAL; + + if (ceiling < 0) + return -EINVAL; + + tlb_single_page_flush_ceiling = ceiling; + return count; +} + +static const struct file_operations fops_tlbflush = { + .read = tlbflush_read_file, + .write = tlbflush_write_file, + .llseek = default_llseek, +}; + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlbflush); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); |