diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-06 01:02:30 +0000 |
commit | 76cb841cb886eef6b3bee341a2266c76578724ad (patch) | |
tree | f5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/x86/mm | |
parent | Initial commit. (diff) | |
download | linux-76cb841cb886eef6b3bee341a2266c76578724ad.tar.xz linux-76cb841cb886eef6b3bee341a2266c76578724ad.zip |
Adding upstream version 4.19.249.upstream/4.19.249
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'arch/x86/mm')
47 files changed, 21254 insertions, 0 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile new file mode 100644 index 000000000..4b101dd6e --- /dev/null +++ b/arch/x86/mm/Makefile @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: GPL-2.0 +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n +KCOV_INSTRUMENT_mem_encrypt_identity.o := n + +KASAN_SANITIZE_mem_encrypt.o := n +KASAN_SANITIZE_mem_encrypt_identity.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_mem_encrypt.o = -pg +CFLAGS_REMOVE_mem_encrypt_identity.o = -pg +endif + +obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o + +# Make sure __phys_addr has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_physaddr.o := $(nostackp) +CFLAGS_setup_nx.o := $(nostackp) +CFLAGS_mem_encrypt_identity.o := $(nostackp) + +CFLAGS_fault.o := -I$(src)/../include/asm/trace + +obj-$(CONFIG_X86_PAT) += pat_rbtree.o + +obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o + +obj-$(CONFIG_HIGHMEM) += highmem_32.o + +KASAN_SANITIZE_kasan_init_$(BITS).o := n +obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o + +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-y := kmmio.o pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o + +obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o +obj-$(CONFIG_AMD_NUMA) += amdtopology.o +obj-$(CONFIG_ACPI_NUMA) += srat.o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o + +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o + +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c new file mode 100644 index 000000000..048c761d9 --- /dev/null +++ b/arch/x86/mm/amdtopology.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the AMD northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/nodemask.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> + +#include <asm/io.h> +#include <linux/pci_ids.h> +#include <linux/acpi.h> +#include <asm/types.h> +#include <asm/mmzone.h> +#include <asm/proto.h> +#include <asm/e820/api.h> +#include <asm/pci-direct.h> +#include <asm/numa.h> +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <asm/amd_nb.h> + +static unsigned char __initdata nodeids[8]; + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -ENOENT; +} + +int __init amd_numa_init(void) +{ + u64 start = PFN_PHYS(0); + u64 end = PFN_PHYS(max_pfn); + unsigned numnodes; + u64 prevbase; + int i, j, nb; + u32 nodeid, reg; + unsigned int bits, cores, apicid_base; + + if (!early_pci_allowed()) + return -EINVAL; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -ENOENT; + + pr_info("Number of physical nodes %d\n", numnodes); + + prevbase = 0; + for (i = 0; i < 8; i++) { + u64 base, limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeids[i] = nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + pr_info("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + pr_info("Skipping node entry %d (base %Lx)\n", + i, base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + pr_err("Node %d using interleaving mode %Lx/%Lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); + return -EINVAL; + } + if (node_isset(nodeid, numa_nodes_parsed)) { + pr_info("Node %d already present, skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit++; + limit <<= 24; + + if (limit > end) + limit = end; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + pr_err("Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + pr_err("Node %d bogus settings %Lx-%Lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. */ + if (prevbase > base) { + pr_err("Node map not sorted %Lx,%Lx\n", + prevbase, base); + return -EINVAL; + } + + pr_info("Node %d MemBase %016Lx Limit %016Lx\n", + nodeid, base, limit); + + prevbase = base; + numa_add_memblk(nodeid, base, limit); + node_set(nodeid, numa_nodes_parsed); + } + + if (!nodes_weight(numa_nodes_parsed)) + return -ENOENT; + + /* + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the coreid bits from early_identify_cpu. + */ + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + apicid_base = 0; + + /* + * get boot-time SMP configuration: + */ + early_get_smp_config(); + + if (boot_cpu_physical_apicid > 0) { + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for_each_node_mask(i, numa_nodes_parsed) + for (j = apicid_base; j < cores + apicid_base; j++) + set_apicid_to_node((i << bits) + j, i); + + return 0; +} diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c new file mode 100644 index 000000000..076ebdce9 --- /dev/null +++ b/arch/x86/mm/cpu_entry_area.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/spinlock.h> +#include <linux/percpu.h> +#include <linux/kallsyms.h> +#include <linux/kcore.h> + +#include <asm/cpu_entry_area.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <asm/desc.h> + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +static DEFINE_PER_CPU(struct kcore_list, kcore_entry_trampoline); +#endif + +struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ + unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; + BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + + return (struct cpu_entry_area *) va; +} +EXPORT_SYMBOL(get_cpu_entry_area); + +void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) +{ + unsigned long va = (unsigned long) cea_vaddr; + pte_t pte = pfn_pte(pa >> PAGE_SHIFT, flags); + + /* + * The cpu_entry_area is shared between the user and kernel + * page tables. All of its ptes can safely be global. + * _PAGE_GLOBAL gets reused to help indicate PROT_NONE for + * non-present PTEs, so be careful not to set it in that + * case to avoid confusion. + */ + if (boot_cpu_has(X86_FEATURE_PGE) && + (pgprot_val(flags) & _PAGE_PRESENT)) + pte = pte_set_flags(pte, _PAGE_GLOBAL); + + set_pte_vaddr(va, pte); +} + +static void __init +cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) +{ + for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) + cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); +} + +static void percpu_setup_debug_store(int cpu) +{ +#ifdef CONFIG_CPU_SUP_INTEL + int npages; + void *cea; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + cea = &get_cpu_entry_area(cpu)->cpu_debug_store; + npages = sizeof(struct debug_store) / PAGE_SIZE; + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0); + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages, + PAGE_KERNEL); + + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers; + /* + * Force the population of PMDs for not yet allocated per cpu + * memory like debug store buffers. + */ + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE; + for (; npages; npages--, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); +#endif +} + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(int cpu) +{ +#ifdef CONFIG_X86_64 + extern char _entry_trampoline[]; + + /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ + pgprot_t gdt_prot = PAGE_KERNEL_RO; + pgprot_t tss_prot = PAGE_KERNEL_RO; +#else + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the + * GDT is read-only, that will triple fault. The TSS cannot be + * read-only because the CPU writes to it on task switches. + * + * On Xen PV, the GDT must be read-only because the hypervisor + * requires it. + */ + pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; + pgprot_t tss_prot = PAGE_KERNEL; +#endif + + cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), + gdt_prot); + + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + per_cpu_ptr(&entry_stack_storage, cpu), 1, + PAGE_KERNEL); + + /* + * The Intel SDM says (Volume 3, 7.2.1): + * + * Avoid placing a page boundary in the part of the TSS that the + * processor reads during a task switch (the first 104 bytes). The + * processor may not correctly perform address translations if a + * boundary occurs in this area. During a task switch, the processor + * reads and writes into the first 104 bytes of each TSS (using + * contiguous physical addresses beginning with the physical address + * of the first byte of the TSS). So, after TSS access begins, if + * part of the 104 bytes is not physically contiguous, the processor + * will access incorrect information without generating a page-fault + * exception. + * + * There are also a lot of errata involving the TSS spanning a page + * boundary. Assert that we're not doing that. + */ + BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ + offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); + BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, + &per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + +#ifdef CONFIG_X86_32 + per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +#endif + +#ifdef CONFIG_X86_64 + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + BUILD_BUG_ON(sizeof(exception_stacks) != + sizeof(((struct cpu_entry_area *)0)->exception_stacks)); + cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, + &per_cpu(exception_stacks, cpu), + sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); + + cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, + __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); + /* + * The cpu_entry_area alias addresses are not in the kernel binary + * so they do not show up in /proc/kcore normally. This adds entries + * for them manually. + */ + kclist_add_remap(&per_cpu(kcore_entry_trampoline, cpu), + _entry_trampoline, + &get_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE); +#endif + percpu_setup_debug_store(cpu); +} + +#ifdef CONFIG_X86_64 +int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name) +{ + unsigned int cpu, ncpu = 0; + + if (symnum >= num_possible_cpus()) + return -EINVAL; + + for_each_possible_cpu(cpu) { + if (ncpu++ >= symnum) + break; + } + + *value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline; + *type = 't'; + strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN); + + return 0; +} +#endif + +static __init void setup_cpu_entry_area_ptes(void) +{ +#ifdef CONFIG_X86_32 + unsigned long start, end; + + BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); + BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); + + start = CPU_ENTRY_AREA_BASE; + end = start + CPU_ENTRY_AREA_MAP_SIZE; + + /* Careful here: start + PMD_SIZE might wrap around */ + for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) + populate_extra_pte(start); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ + unsigned int cpu; + + setup_cpu_entry_area_ptes(); + + for_each_possible_cpu(cpu) + setup_cpu_entry_area(cpu); + + /* + * This is the last essential update to swapper_pgdir which needs + * to be synchronized to initial_page_table on 32bit. + */ + sync_initial_page_table(); +} diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c new file mode 100644 index 000000000..225fe2f0b --- /dev/null +++ b/arch/x86/mm/debug_pagetables.c @@ -0,0 +1,146 @@ +#include <linux/debugfs.h> +#include <linux/efi.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <asm/pgtable.h> + +static int ptdump_show(struct seq_file *m, void *v) +{ + ptdump_walk_pgd_level_debugfs(m, NULL, false); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .owner = THIS_MODULE, + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ptdump_show_curknl(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curknl(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curknl, NULL); +} + +static const struct file_operations ptdump_curknl_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curknl, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static struct dentry *pe_curusr; + +static int ptdump_show_curusr(struct seq_file *m, void *v) +{ + if (current->mm->pgd) { + down_read(¤t->mm->mmap_sem); + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true); + up_read(¤t->mm->mmap_sem); + } + return 0; +} + +static int ptdump_open_curusr(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_curusr, NULL); +} + +static const struct file_operations ptdump_curusr_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_curusr, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) +static struct dentry *pe_efi; + +static int ptdump_show_efi(struct seq_file *m, void *v) +{ + if (efi_mm.pgd) + ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false); + return 0; +} + +static int ptdump_open_efi(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show_efi, NULL); +} + +static const struct file_operations ptdump_efi_fops = { + .owner = THIS_MODULE, + .open = ptdump_open_efi, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +static struct dentry *dir, *pe_knl, *pe_curknl; + +static int __init pt_dump_debug_init(void) +{ + dir = debugfs_create_dir("page_tables", NULL); + if (!dir) + return -ENOMEM; + + pe_knl = debugfs_create_file("kernel", 0400, dir, NULL, + &ptdump_fops); + if (!pe_knl) + goto err; + + pe_curknl = debugfs_create_file("current_kernel", 0400, + dir, NULL, &ptdump_curknl_fops); + if (!pe_curknl) + goto err; + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pe_curusr = debugfs_create_file("current_user", 0400, + dir, NULL, &ptdump_curusr_fops); + if (!pe_curusr) + goto err; +#endif + +#if defined(CONFIG_EFI) && defined(CONFIG_X86_64) + pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops); + if (!pe_efi) + goto err; +#endif + + return 0; +err: + debugfs_remove_recursive(dir); + return -ENOMEM; +} + +static void __exit pt_dump_debug_exit(void) +{ + debugfs_remove_recursive(dir); +} + +module_init(pt_dump_debug_init); +module_exit(pt_dump_debug_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c new file mode 100644 index 000000000..abcb8d00b --- /dev/null +++ b/arch/x86/mm/dump_pagetables.c @@ -0,0 +1,640 @@ +/* + * Debug helper to dump the current kernel pagetables of the system + * so that we can see what the various memory ranges are set to. + * + * (C) Copyright 2008 Intel Corporation + * + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/debugfs.h> +#include <linux/kasan.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/highmem.h> +#include <linux/pci.h> + +#include <asm/e820/types.h> +#include <asm/pgtable.h> + +/* + * The dumper groups pagetable entries of the same type into one, and for + * that it needs to keep some state when walking, and flush this state + * when a "break" in the continuity is found. + */ +struct pg_state { + int level; + pgprot_t current_prot; + pgprotval_t effective_prot; + unsigned long start_address; + unsigned long current_address; + const struct addr_marker *marker; + unsigned long lines; + bool to_dmesg; + bool check_wx; + unsigned long wx_pages; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; + unsigned long max_lines; +}; + +/* Address space markers hints */ + +#ifdef CONFIG_X86_64 + +enum address_markers_idx { + USER_SPACE_NR = 0, + KERNEL_SPACE_NR, +#ifdef CONFIG_MODIFY_LDT_SYSCALL + LDT_NR, +#endif + LOW_KERNEL_NR, + VMALLOC_START_NR, + VMEMMAP_START_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif + CPU_ENTRY_AREA_NR, +#ifdef CONFIG_X86_ESPFIX64 + ESPFIX_START_NR, +#endif +#ifdef CONFIG_EFI + EFI_END_NR, +#endif + HIGH_KERNEL_NR, + MODULES_VADDR_NR, + MODULES_END_NR, + FIXADDR_START_NR, + END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" }, + [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMEMMAP_START_NR] = { 0UL, "Vmemmap" }, +#ifdef CONFIG_KASAN + /* + * These fields get initialized with the (dynamic) + * KASAN_SHADOW_{START,END} values in pt_dump_init(). + */ + [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" }, + [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { 0UL, "LDT remap" }, +#endif + [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, +#ifdef CONFIG_X86_ESPFIX64 + [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, +#endif +#ifdef CONFIG_EFI + [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" }, +#endif + [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" }, + [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, + [MODULES_END_NR] = { MODULES_END, "End Modules" }, + [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, + [END_OF_SPACE_NR] = { -1, NULL } +}; + +#define INIT_PGD ((pgd_t *) &init_top_pgt) + +#else /* CONFIG_X86_64 */ + +enum address_markers_idx { + USER_SPACE_NR = 0, + KERNEL_SPACE_NR, + VMALLOC_START_NR, + VMALLOC_END_NR, +#ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + LDT_NR, +#endif + CPU_ENTRY_AREA_NR, + FIXADDR_START_NR, + END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { + [USER_SPACE_NR] = { 0, "User Space" }, + [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" }, + [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" }, + [VMALLOC_END_NR] = { 0UL, "vmalloc() End" }, +#ifdef CONFIG_HIGHMEM + [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" }, +#endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL + [LDT_NR] = { 0UL, "LDT remap" }, +#endif + [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" }, + [FIXADDR_START_NR] = { 0UL, "Fixmap area" }, + [END_OF_SPACE_NR] = { -1, NULL } +}; + +#define INIT_PGD (swapper_pg_dir) + +#endif /* !CONFIG_X86_64 */ + +/* Multipliers for offsets within the PTEs */ +#define PTE_LEVEL_MULT (PAGE_SIZE) +#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) +#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT) + +#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ +({ \ + if (to_dmesg) \ + printk(KERN_INFO fmt, ##args); \ + else \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \ +({ \ + if (to_dmesg) \ + printk(KERN_CONT fmt, ##args); \ + else \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +/* + * Print a readable form of a pgprot_t to the seq_file + */ +static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) +{ + pgprotval_t pr = pgprot_val(prot); + static const char * const level_name[] = + { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; + + if (!(pr & _PAGE_PRESENT)) { + /* Not present */ + pt_dump_cont_printf(m, dmsg, " "); + } else { + if (pr & _PAGE_USER) + pt_dump_cont_printf(m, dmsg, "USR "); + else + pt_dump_cont_printf(m, dmsg, " "); + if (pr & _PAGE_RW) + pt_dump_cont_printf(m, dmsg, "RW "); + else + pt_dump_cont_printf(m, dmsg, "ro "); + if (pr & _PAGE_PWT) + pt_dump_cont_printf(m, dmsg, "PWT "); + else + pt_dump_cont_printf(m, dmsg, " "); + if (pr & _PAGE_PCD) + pt_dump_cont_printf(m, dmsg, "PCD "); + else + pt_dump_cont_printf(m, dmsg, " "); + + /* Bit 7 has a different meaning on level 3 vs 4 */ + if (level <= 4 && pr & _PAGE_PSE) + pt_dump_cont_printf(m, dmsg, "PSE "); + else + pt_dump_cont_printf(m, dmsg, " "); + if ((level == 5 && pr & _PAGE_PAT) || + ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) + pt_dump_cont_printf(m, dmsg, "PAT "); + else + pt_dump_cont_printf(m, dmsg, " "); + if (pr & _PAGE_GLOBAL) + pt_dump_cont_printf(m, dmsg, "GLB "); + else + pt_dump_cont_printf(m, dmsg, " "); + if (pr & _PAGE_NX) + pt_dump_cont_printf(m, dmsg, "NX "); + else + pt_dump_cont_printf(m, dmsg, "x "); + } + pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]); +} + +/* + * On 64 bits, sign-extend the 48 bit address to 64 bit + */ +static unsigned long normalize_addr(unsigned long u) +{ + int shift; + if (!IS_ENABLED(CONFIG_X86_64)) + return u; + + shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + return (signed long)(u << shift) >> shift; +} + +static void note_wx(struct pg_state *st) +{ + unsigned long npages; + + npages = (st->current_address - st->start_address) / PAGE_SIZE; + +#ifdef CONFIG_PCI_BIOS + /* + * If PCI BIOS is enabled, the PCI BIOS area is forced to WX. + * Inform about it, but avoid the warning. + */ + if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN && + st->current_address <= PAGE_OFFSET + BIOS_END) { + pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages); + return; + } +#endif + /* Account the WX pages */ + st->wx_pages += npages; + WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); +} + +/* + * This function gets called on a break in a continuous series + * of PTE entries; the next one is different so we need to + * print what we collected so far. + */ +static void note_page(struct seq_file *m, struct pg_state *st, + pgprot_t new_prot, pgprotval_t new_eff, int level) +{ + pgprotval_t prot, cur, eff; + static const char units[] = "BKMGTPE"; + + /* + * If we have a "break" in the series, we need to flush the state that + * we have now. "break" is either changing perms, levels or + * address space marker. + */ + prot = pgprot_val(new_prot); + cur = pgprot_val(st->current_prot); + eff = st->effective_prot; + + if (!st->level) { + /* First entry */ + st->current_prot = new_prot; + st->effective_prot = new_eff; + st->level = level; + st->marker = address_markers; + st->lines = 0; + pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", + st->marker->name); + } else if (prot != cur || new_eff != eff || level != st->level || + st->current_address >= st->marker[1].start_address) { + const char *unit = units; + unsigned long delta; + int width = sizeof(unsigned long) * 2; + + if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) + note_wx(st); + + /* + * Now print the actual finished series + */ + if (!st->marker->max_lines || + st->lines < st->marker->max_lines) { + pt_dump_seq_printf(m, st->to_dmesg, + "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); + + delta = st->current_address - st->start_address; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", + delta, *unit); + printk_prot(m, st->current_prot, st->level, + st->to_dmesg); + } + st->lines++; + + /* + * We print markers for special areas of address space, + * such as the start of vmalloc space etc. + * This helps in the interpretation. + */ + if (st->current_address >= st->marker[1].start_address) { + if (st->marker->max_lines && + st->lines > st->marker->max_lines) { + unsigned long nskip = + st->lines - st->marker->max_lines; + pt_dump_seq_printf(m, st->to_dmesg, + "... %lu entr%s skipped ... \n", + nskip, + nskip == 1 ? "y" : "ies"); + } + st->marker++; + st->lines = 0; + pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", + st->marker->name); + } + + st->start_address = st->current_address; + st->current_prot = new_prot; + st->effective_prot = new_eff; + st->level = level; + } +} + +static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +{ + return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | + ((prot1 | prot2) & _PAGE_NX); +} + +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, + pgprotval_t eff_in, unsigned long P) +{ + int i; + pte_t *pte; + pgprotval_t prot, eff; + + for (i = 0; i < PTRS_PER_PTE; i++) { + st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); + pte = pte_offset_map(&addr, st->current_address); + prot = pte_flags(*pte); + eff = effective_prot(eff_in, prot); + note_page(m, st, __pgprot(prot), eff, 5); + pte_unmap(pte); + } +} +#ifdef CONFIG_KASAN + +/* + * This is an optimization for KASAN=y case. Since all kasan page tables + * eventually point to the kasan_zero_page we could call note_page() + * right away without walking through lower level page tables. This saves + * us dozens of seconds (minutes for 5-level config) while checking for + * W+X mapping or reading kernel_page_tables debugfs file. + */ +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + if (__pa(pt) == __pa(kasan_zero_pmd) || + (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) || + __pa(pt) == __pa(kasan_zero_pud)) { + pgprotval_t prot = pte_flags(kasan_zero_pte[0]); + note_page(m, st, __pgprot(prot), 0, 5); + return true; + } + return false; +} +#else +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + return false; +} +#endif + +#if PTRS_PER_PMD > 1 + +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, + pgprotval_t eff_in, unsigned long P) +{ + int i; + pmd_t *start, *pmd_start; + pgprotval_t prot, eff; + + pmd_start = start = (pmd_t *)pud_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PMD; i++) { + st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); + if (!pmd_none(*start)) { + prot = pmd_flags(*start); + eff = effective_prot(eff_in, prot); + if (pmd_large(*start) || !pmd_present(*start)) { + note_page(m, st, __pgprot(prot), eff, 4); + } else if (!kasan_page_table(m, st, pmd_start)) { + walk_pte_level(m, st, *start, eff, + P + i * PMD_LEVEL_MULT); + } + } else + note_page(m, st, __pgprot(0), 0, 4); + start++; + } +} + +#else +#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p) +#define pud_large(a) pmd_large(__pmd(pud_val(a))) +#define pud_none(a) pmd_none(__pmd(pud_val(a))) +#endif + +#if PTRS_PER_PUD > 1 + +static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, + pgprotval_t eff_in, unsigned long P) +{ + int i; + pud_t *start, *pud_start; + pgprotval_t prot, eff; + pud_t *prev_pud = NULL; + + pud_start = start = (pud_t *)p4d_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_PUD; i++) { + st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); + if (!pud_none(*start)) { + prot = pud_flags(*start); + eff = effective_prot(eff_in, prot); + if (pud_large(*start) || !pud_present(*start)) { + note_page(m, st, __pgprot(prot), eff, 3); + } else if (!kasan_page_table(m, st, pud_start)) { + walk_pmd_level(m, st, *start, eff, + P + i * PUD_LEVEL_MULT); + } + } else + note_page(m, st, __pgprot(0), 0, 3); + + prev_pud = start; + start++; + } +} + +#else +#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p) +#define p4d_large(a) pud_large(__pud(p4d_val(a))) +#define p4d_none(a) pud_none(__pud(p4d_val(a))) +#endif + +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, + pgprotval_t eff_in, unsigned long P) +{ + int i; + p4d_t *start, *p4d_start; + pgprotval_t prot, eff; + + if (PTRS_PER_P4D == 1) + return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P); + + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_P4D; i++) { + st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); + if (!p4d_none(*start)) { + prot = p4d_flags(*start); + eff = effective_prot(eff_in, prot); + if (p4d_large(*start) || !p4d_present(*start)) { + note_page(m, st, __pgprot(prot), eff, 2); + } else if (!kasan_page_table(m, st, p4d_start)) { + walk_pud_level(m, st, *start, eff, + P + i * P4D_LEVEL_MULT); + } + } else + note_page(m, st, __pgprot(0), 0, 2); + + start++; + } +} + +#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a)))) +#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a)))) + +static inline bool is_hypervisor_range(int idx) +{ +#ifdef CONFIG_X86_64 + /* + * A hole in the beginning of kernel address space reserved + * for a hypervisor. + */ + return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) && + (idx < pgd_index(GUARD_HOLE_END_ADDR)); +#else + return false; +#endif +} + +static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, + bool checkwx, bool dmesg) +{ + pgd_t *start = INIT_PGD; + pgprotval_t prot, eff; + int i; + struct pg_state st = {}; + + if (pgd) { + start = pgd; + st.to_dmesg = dmesg; + } + + st.check_wx = checkwx; + if (checkwx) + st.wx_pages = 0; + + for (i = 0; i < PTRS_PER_PGD; i++) { + st.current_address = normalize_addr(i * PGD_LEVEL_MULT); + if (!pgd_none(*start) && !is_hypervisor_range(i)) { + prot = pgd_flags(*start); +#ifdef CONFIG_X86_PAE + eff = _PAGE_USER | _PAGE_RW; +#else + eff = prot; +#endif + if (pgd_large(*start) || !pgd_present(*start)) { + note_page(m, &st, __pgprot(prot), eff, 1); + } else { + walk_p4d_level(m, &st, *start, eff, + i * PGD_LEVEL_MULT); + } + } else + note_page(m, &st, __pgprot(0), 0, 1); + + cond_resched(); + start++; + } + + /* Flush out the last page */ + st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); + note_page(m, &st, __pgprot(0), 0, 0); + if (!checkwx) + return; + if (st.wx_pages) + pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n", + st.wx_pages); + else + pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n"); +} + +void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) +{ + ptdump_walk_pgd_level_core(m, pgd, false, true); +} + +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (user && static_cpu_has(X86_FEATURE_PTI)) + pgd = kernel_to_user_pgdp(pgd); +#endif + ptdump_walk_pgd_level_core(m, pgd, false, false); +} +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs); + +void ptdump_walk_user_pgd_level_checkwx(void) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + pgd_t *pgd = INIT_PGD; + + if (!(__supported_pte_mask & _PAGE_NX) || + !static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("x86/mm: Checking user space page tables\n"); + pgd = kernel_to_user_pgdp(pgd); + ptdump_walk_pgd_level_core(NULL, pgd, true, false); +#endif +} + +void ptdump_walk_pgd_level_checkwx(void) +{ + ptdump_walk_pgd_level_core(NULL, NULL, true, false); +} + +static int __init pt_dump_init(void) +{ + /* + * Various markers are not compile-time constants, so assign them + * here. + */ +#ifdef CONFIG_X86_64 + address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET; + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; +#ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +#endif +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif +#endif +#ifdef CONFIG_X86_32 + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; +# ifdef CONFIG_HIGHMEM + address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; +# endif + address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; + address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE; +# ifdef CONFIG_MODIFY_LDT_SYSCALL + address_markers[LDT_NR].start_address = LDT_BASE_ADDR; +# endif +#endif + return 0; +} +__initcall(pt_dump_init); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c new file mode 100644 index 000000000..45f5d6cf6 --- /dev/null +++ b/arch/x86/mm/extable.c @@ -0,0 +1,251 @@ +#include <linux/extable.h> +#include <linux/uaccess.h> +#include <linux/sched/debug.h> +#include <xen/xen.h> + +#include <asm/fpu/internal.h> +#include <asm/traps.h> +#include <asm/kdebug.h> + +typedef bool (*ex_handler_t)(const struct exception_table_entry *, + struct pt_regs *, int); + +static inline unsigned long +ex_fixup_addr(const struct exception_table_entry *x) +{ + return (unsigned long)&x->fixup + x->fixup; +} +static inline ex_handler_t +ex_fixup_handler(const struct exception_table_entry *x) +{ + return (ex_handler_t)((unsigned long)&x->handler + x->handler); +} + +__visible bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_default); + +__visible bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + regs->ax = trapnr; + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fault); + +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +__visible bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). + * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * The SF case is not reported since it indicates post-boundary + * manipulations below zero or above INT_MAX. And if none of the + * flags are set, something has gone very wrong, so report it. + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } else if ((regs->flags & X86_EFLAGS_SF) == 0) { + /* Report if none of OF, ZF, nor SF are set. */ + refcount_error_report(regs, "unexpected saturation"); + } + + return true; +} +EXPORT_SYMBOL(ex_handler_refcount); + +/* + * Handler for when we fail to restore a task's FPU state. We should never get + * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * should always be valid. However, past bugs have allowed userspace to set + * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). + * These caused XRSTOR to fail when switching to the task, leaking the FPU + * registers of the task previously executing on the CPU. Mitigate this class + * of vulnerability by restoring from the initial state (essentially, zeroing + * out all the FPU registers) if we can't restore from the task's FPU state. + */ +__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + + WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", + (void *)instruction_pointer(regs)); + + __copy_kernel_to_fpregs(&init_fpstate, -1); + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fprestore); + +__visible bool ex_handler_ext(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* Special hack for uaccess_err */ + current->thread.uaccess_err = 1; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_ext); + +__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) + show_stack_regs(regs); + + /* Pretend that the read succeeded and returned 0. */ + regs->ip = ex_fixup_addr(fixup); + regs->ax = 0; + regs->dx = 0; + return true; +} +EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); + +__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, + (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) + show_stack_regs(regs); + + /* Pretend that the write succeeded. */ + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); + +__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + if (static_cpu_has(X86_BUG_NULL_SEG)) + asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); + asm volatile ("mov %0, %%fs" : : "rm" (0)); + return ex_handler_default(fixup, regs, trapnr); +} +EXPORT_SYMBOL(ex_handler_clear_fs); + +__visible bool ex_has_fault_handler(unsigned long ip) +{ + const struct exception_table_entry *e; + ex_handler_t handler; + + e = search_exception_tables(ip); + if (!e) + return false; + handler = ex_fixup_handler(e); + + return handler == ex_handler_fault; +} + +int fixup_exception(struct pt_regs *regs, int trapnr) +{ + const struct exception_table_entry *e; + ex_handler_t handler; + +#ifdef CONFIG_PNPBIOS + if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { + extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; + extern u32 pnp_bios_is_utter_crap; + pnp_bios_is_utter_crap = 1; + printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); + __asm__ volatile( + "movl %0, %%esp\n\t" + "jmp *%1\n\t" + : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); + panic("do_trap: can't hit this"); + } +#endif + + e = search_exception_tables(regs->ip); + if (!e) + return 0; + + handler = ex_fixup_handler(e); + return handler(e, regs, trapnr); +} + +extern unsigned int early_recursion_flag; + +/* Restricted version used during very early boot */ +void __init early_fixup_exception(struct pt_regs *regs, int trapnr) +{ + /* Ignore early NMIs. */ + if (trapnr == X86_TRAP_NMI) + return; + + if (early_recursion_flag > 2) + goto halt_loop; + + /* + * Old CPUs leave the high bits of CS on the stack + * undefined. I'm not sure which CPUs do this, but at least + * the 486 DX works this way. + * Xen pv domains are not using the default __KERNEL_CS. + */ + if (!xen_pv_domain() && regs->cs != __KERNEL_CS) + goto fail; + + /* + * The full exception fixup machinery is available as soon as + * the early IDT is loaded. This means that it is the + * responsibility of extable users to either function correctly + * when handlers are invoked early or to simply avoid causing + * exceptions before they're ready to handle them. + * + * This is better than filtering which handlers can be used, + * because refusing to call a handler here is guaranteed to + * result in a hard-to-debug panic. + * + * Keep in mind that not all vectors actually get here. Early + * fage faults, for example, are special. + */ + if (fixup_exception(regs, trapnr)) + return; + + if (fixup_bug(regs, trapnr)) + return; + +fail: + early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n", + (unsigned)trapnr, (unsigned long)regs->cs, regs->ip, + regs->orig_ax, read_cr2()); + + show_regs(regs); + +halt_loop: + while (true) + halt(); +} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c new file mode 100644 index 000000000..c61acf635 --- /dev/null +++ b/arch/x86/mm/fault.c @@ -0,0 +1,1490 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar + */ +#include <linux/sched.h> /* test_thread_flag(), ... */ +#include <linux/sched/task_stack.h> /* task_stack_*(), ... */ +#include <linux/kdebug.h> /* oops_begin/end, ... */ +#include <linux/extable.h> /* search_exception_tables */ +#include <linux/bootmem.h> /* max_low_pfn */ +#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */ +#include <linux/mmiotrace.h> /* kmmio_handler, ... */ +#include <linux/perf_event.h> /* perf_sw_event */ +#include <linux/hugetlb.h> /* hstate_index_to_shift */ +#include <linux/prefetch.h> /* prefetchw */ +#include <linux/context_tracking.h> /* exception_enter(), ... */ +#include <linux/uaccess.h> /* faulthandler_disabled() */ +#include <linux/mm_types.h> + +#include <asm/cpufeature.h> /* boot_cpu_has, ... */ +#include <asm/traps.h> /* dotraplinkage, ... */ +#include <asm/pgalloc.h> /* pgd_*(), ... */ +#include <asm/fixmap.h> /* VSYSCALL_ADDR */ +#include <asm/vsyscall.h> /* emulate_vsyscall */ +#include <asm/vm86.h> /* struct vm86 */ +#include <asm/mmu_context.h> /* vma_pkey() */ + +#define CREATE_TRACE_POINTS +#include <asm/trace/exceptions.h> + +/* + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: + */ +static nokprobe_inline int +kmmio_fault(struct pt_regs *regs, unsigned long addr) +{ + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; + return 0; +} + +static nokprobe_inline int kprobes_fault(struct pt_regs *regs) +{ + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (kprobes_built_in() && !user_mode(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } + + return ret; +} + +/* + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * 64-bit mode: + * + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. + */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs) || user_64bit_mode(regs)); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) +{ + unsigned char *max_instr; + unsigned char *instr; + int prefetch = 0; + + /* + * If it was a exec (instruction fetch) fault on NX page, then + * do not ignore the fault: + */ + if (error_code & X86_PF_INSTR) + return 0; + + instr = (void *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) + return 0; + + while (instr < max_instr) { + unsigned char opcode; + + if (probe_kernel_address(instr, opcode)) + break; + + instr++; + + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) + break; + } + return prefetch; +} + +/* + * A protection key fault means that the PKRU value did not allow + * access to some PTE. Userspace can figure out what PKRU was + * from the XSAVE state, and this function fills out a field in + * siginfo so userspace can discover which protection key was set + * on the PTE. + * + * If we get here, we know that the hardware signaled a X86_PF_PK + * fault and that there was a VMA once we got in the fault + * handler. It does *not* guarantee that the VMA we find here + * was the one that we faulted on. + * + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); + * 2. T1 : set PKRU to deny access to pkey=4, touches page + * 3. T1 : faults... + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); + * 5. T1 : enters fault handler, takes mmap_sem, etc... + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ +static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info, + u32 *pkey) +{ + /* This is effectively an #ifdef */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + /* Fault not from Protection Keys: nothing to do */ + if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV)) + return; + /* + * force_sig_info_fault() is called from a number of + * contexts, some of which have a VMA and some of which + * do not. The X86_PF_PK handing happens after we have a + * valid VMA, so we should never reach this without a + * valid VMA. + */ + if (!pkey) { + WARN_ONCE(1, "PKU fault with no VMA passed in"); + info->si_pkey = 0; + return; + } + /* + * si_pkey should be thought of as a strong hint, but not + * absolutely guranteed to be 100% accurate because of + * the race explained above. + */ + info->si_pkey = *pkey; +} + +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk, u32 *pkey, int fault) +{ + unsigned lsb = 0; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; + + fill_sig_info_pkey(si_signo, si_code, &info, pkey); + + force_sig_info(si_signo, &info, tsk); +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_p4d/set_pud. + */ + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + + if (pmd_present(*pmd) != pmd_present(*pmd_k)) + set_pmd(pmd, *pmd_k); + + if (!pmd_present(*pmd_k)) + return NULL; + else + BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); + + return pmd_k; +} + +static void vmalloc_sync(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE_MAX && address < VMALLOC_END; + address += PMD_SIZE) { + struct page *page; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + spinlock_t *pgt_lock; + + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + } + spin_unlock(&pgd_lock); + } +} + +void vmalloc_sync_mappings(void) +{ + vmalloc_sync(); +} + +void vmalloc_sync_unmappings(void) +{ + vmalloc_sync(); +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3_pa(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + if (pmd_large(*pmd_k)) + return 0; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} +NOKPROBE_SYMBOL(vmalloc_fault); + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +#ifdef CONFIG_VM86 + unsigned long bit; + + if (!v8086_mode(regs) || !tsk->thread.vm86) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.vm86->screen_bitmap |= 1 << bit; +#endif +} + +static bool low_pfn(unsigned long pfn) +{ + return pfn < max_low_pfn; +} + +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3_pa()); + pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + +#ifdef CONFIG_X86_PAE + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; +#define pr_pde pr_cont +#else +#define pr_pde pr_info +#endif + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); +#undef pr_pde + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already: + */ + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + + pte = pte_offset_kernel(pmd, address); + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); +out: + pr_cont("\n"); +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_mappings(void) +{ + /* + * 64-bit mappings might allocate new p4d/pud pages + * that need to be propagated to all tasks' PGDs. + */ + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); +} + +void vmalloc_sync_unmappings(void) +{ + /* + * Unmappings never allocate or free p4d/pud pages. + * No work is required here. + */ +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); + pgd_k = pgd_offset_k(address); + if (pgd_none(*pgd_k)) + return -1; + + if (pgtable_l5_enabled()) { + if (pgd_none(*pgd)) { + set_pgd(pgd, *pgd_k); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); + } + } + + /* With 4-level paging, copying happens on the p4d level. */ + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (p4d_none(*p4d_k)) + return -1; + + if (p4d_none(*p4d) && !pgtable_l5_enabled()) { + set_p4d(p4d, *p4d_k); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); + } + + BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); + + pud = pud_offset(p4d, address); + if (pud_none(*pud)) + return -1; + + if (pud_large(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return -1; + + if (pmd_large(*pmd)) + return 0; + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return -1; + + return 0; +} +NOKPROBE_SYMBOL(vmalloc_fault); + +#ifdef CONFIG_CPU_SUP_AMD +static const char errata93_warning[] = +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; +#endif + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3_pa()); + pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (bad_address(pgd)) + goto bad; + + pr_info("PGD %lx ", pgd_val(*pgd)); + + if (!pgd_present(*pgd)) + goto out; + + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + pr_cont("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); + if (bad_address(pud)) + goto bad; + + pr_cont("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud) || pud_large(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) + goto bad; + + pr_cont("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) + goto bad; + + pr_cont("PTE %lx", pte_val(*pte)); +out: + pr_cont("\n"); + return; +bad: + pr_info("BAD\n"); +} + +#endif /* CONFIG_X86_64 */ + +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. + */ +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ +#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD + || boot_cpu_data.x86 != 0xf) + return 0; + + if (address != regs->ip) + return 0; + + if ((address >> 32) != 0) + return 0; + + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + printk_once(errata93_warning); + regs->ip = address; + return 1; + } +#endif + return 0; +} + +/* + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. + */ +static int is_errata100(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) + return 1; +#endif + return 0; +} + +static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_F00F_BUG + unsigned long nr; + + /* + * Pentium F0 0F C7 C8 bug workaround: + */ + if (boot_cpu_has_bug(X86_BUG_F00F)) { + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return 1; + } + } +#endif + return 0; +} + +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + if (!oops_may_print()) + return; + + if (error_code & X86_PF_INSTR) { + unsigned int level; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd += pgd_index(address); + + pte = lookup_address_in_pgd(pgd, address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) + pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", + from_kuid(&init_user_ns, current_uid())); + if (pte && pte_present(*pte) && pte_exec(*pte) && + (pgd_flags(*pgd) & _PAGE_USER) && + (__read_cr4() & X86_CR4_SMEP)) + pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n", + from_kuid(&init_user_ns, current_uid())); + } + + pr_alert("BUG: unable to handle kernel %s at %px\n", + address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", + (void *)address); + + dump_pagetable(address); +} + +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + tsk->comm, address); + dump_pagetable(address); + + tsk->thread.cr2 = address; + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code; + + if (__die("Bad pagetable", regs, error_code)) + sig = 0; + + oops_end(flags, regs, sig); +} + +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) +{ + struct task_struct *tsk = current; + unsigned long flags; + int sig; + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs, X86_TRAP_PF)) { + /* + * Any interrupt that takes a fault gets the fixup. This makes + * the below recursive fault logic only apply to a faults from + * task context. + */ + if (in_interrupt()) + return; + + /* + * Per the above we're !in_interrupt(), aka. task context. + * + * In this case we need to make sure we're not recursively + * faulting through the emulate_vsyscall() logic. + */ + if (current->thread.sig_on_uaccess_err && signal) { + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code | X86_PF_USER; + tsk->thread.cr2 = address; + + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_info_fault(signal, si_code, address, + tsk, NULL, 0); + } + + /* + * Barring that, we can do the fixup and be happy. + */ + return; + } + +#ifdef CONFIG_VMAP_STACK + /* + * Stack overflow? During boot, we can fault near the initial + * stack in the direct map, but that's not an overflow -- check + * that we're in vmalloc space to avoid this. + */ + if (is_vmalloc_addr((void *)address) && + (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { + unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + /* + * We're likely to be running with very little stack space + * left. It's plausible that we'd hit this condition but + * double-fault even before we get this far, in which case + * we're fine: the double-fault handler will deal with it. + * + * We don't want to make it all the way into the oops code + * and then double-fault, though, because we're likely to + * break the console driver and lose most of the stack dump. + */ + asm volatile ("movq %[stack], %%rsp\n\t" + "call handle_stack_overflow\n\t" + "1: jmp 1b" + : ASM_CALL_CONSTRAINT + : "D" ("kernel stack overflow (page fault)"), + "S" (regs), "d" (address), + [stack] "rm" (stack)); + unreachable(); + } +#endif + + /* + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: + * + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice: + */ + flags = oops_begin(); + + show_fault_oops(regs, error_code, address); + + if (task_stack_end_corrupted(tsk)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + + tsk->thread.cr2 = address; + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code; + + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_DEFAULT "CR2: %016lx\n", address); + + oops_end(flags, regs, sig); +} + +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; + + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx", + loglvl, tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); + + show_opcodes(regs, loglvl); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, u32 *pkey, int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & X86_PF_USER) { + /* + * It's possible to have interrupts off here: + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + +#ifdef CONFIG_X86_64 + /* + * Instruction fetch faults in the vsyscall page might need + * emulation. + */ + if (unlikely((error_code & X86_PF_INSTR) && + ((address & ~0xfff) == VSYSCALL_ADDR))) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + + /* + * To avoid leaking information about the kernel page table + * layout, pretend that user-mode accesses to kernel addresses + * are always protection faults. + */ + if (address >= TASK_SIZE_MAX) + error_code |= X86_PF_PROT; + + if (likely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_PF; + + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); + + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address, SIGSEGV, si_code); +} + +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, u32 *pkey) +{ + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); +} + +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct vm_area_struct *vma, int si_code) +{ + struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? &pkey : NULL, si_code); +} + +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, NULL, SEGV_MAPERR); +} + +static inline bool bad_area_access_from_pkeys(unsigned long error_code, + struct vm_area_struct *vma) +{ + /* This code is always called on the current mm */ + bool foreign = false; + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return false; + if (error_code & X86_PF_PK) + return true; + /* this checks permission keys on the VMA: */ + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) + return true; + return false; +} + +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct vm_area_struct *vma) +{ + /* + * This OSPKE check is not strictly necessary at runtime. + * But, doing it this way allows compiler optimizations + * if pkeys are compiled out. + */ + if (bad_area_access_from_pkeys(error_code, vma)) + __bad_area(regs, error_code, address, vma, SEGV_PKUERR); + else + __bad_area(regs, error_code, address, vma, SEGV_ACCERR); +} + +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + u32 *pkey, unsigned int fault) +{ + struct task_struct *tsk = current; + int code = BUS_ADRERR; + + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + return; + } + + /* User-space => ok to do another page fault: */ + if (is_prefetch(regs, error_code, address)) + return; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_PF; + +#ifdef CONFIG_MEMORY_FAILURE + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + printk(KERN_ERR + "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + tsk->comm, tsk->pid, address); + code = BUS_MCEERR_AR; + } +#endif + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); +} + +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, u32 *pkey, vm_fault_t fault) +{ + if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, 0, 0); + return; + } + + if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & X86_PF_USER)) { + no_context(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); + return; + } + + /* + * We ran out of memory, call the OOM killer, and return the + * userspace (which will retry the fault, or kill us if we got + * oom-killed): + */ + pagefault_out_of_memory(); + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + do_sigbus(regs, error_code, address, pkey, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address, pkey); + else + BUG(); + } +} + +static int spurious_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & X86_PF_WRITE) && !pte_write(*pte)) + return 0; + + if ((error_code & X86_PF_INSTR) && !pte_exec(*pte)) + return 0; + /* + * Note: We do not do lazy flushing on protection key + * changes, so no spurious fault will ever set X86_PF_PK. + */ + if ((error_code & X86_PF_PK)) + return 1; + + return 1; +} + +/* + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * + * Spurious faults may only occur if the TLB contains an entry with + * fewer permission than the page table entry. Non-present (P = 0) + * and reserved bit (R = 1) faults are never spurious. + * + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + * + * Returns non-zero if a spurious fault was handled, zero otherwise. + * + * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3 + * (Optional Invalidation). + */ +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret; + + /* + * Only writes to RO or instruction fetches from NX may cause + * spurious faults. + * + * These could be from user or supervisor accesses but the TLB + * is only lazily flushed after a kernel mapping protection + * change, so user accesses are not expected to cause spurious + * faults. + */ + if (error_code != (X86_PF_WRITE | X86_PF_PROT) && + error_code != (X86_PF_INSTR | X86_PF_PROT)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); + if (!pud_present(*pud)) + return 0; + + if (pud_large(*pud)) + return spurious_fault_check(error_code, (pte_t *) pud); + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return spurious_fault_check(error_code, (pte_t *) pmd); + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: + */ + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + + return ret; +} +NOKPROBE_SYMBOL(spurious_fault); + +int show_unhandled_signals = 1; + +static inline int +access_error(unsigned long error_code, struct vm_area_struct *vma) +{ + /* This is only called for the current mm, so: */ + bool foreign = false; + + /* + * Read or write was blocked by protection keys. This is + * always an unconditional error and can never result in + * a follow-up action to resolve the fault, like a COW. + */ + if (error_code & X86_PF_PK) + return 1; + + /* + * Make sure to check the VMA so that we do not perform + * faults just to hit a X86_PF_PK as soon as we fill in a + * page. + */ + if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE), + (error_code & X86_PF_INSTR), foreign)) + return 1; + + if (error_code & X86_PF_WRITE) { + /* write, present and write, not present: */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } + + /* read, present: */ + if (unlikely(error_code & X86_PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + + return 0; +} + +static int fault_in_kernel_space(unsigned long address) +{ + return address >= TASK_SIZE_MAX; +} + +static inline bool smap_violation(int error_code, struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_X86_SMAP)) + return false; + + if (!static_cpu_has(X86_FEATURE_SMAP)) + return false; + + if (error_code & X86_PF_USER) + return false; + + if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) + return false; + + return true; +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +static noinline void +__do_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct mm_struct *mm; + vm_fault_t fault, major = 0; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; + + tsk = current; + mm = tsk->mm; + + prefetchw(&mm->mmap_sem); + + if (unlikely(kmmio_fault(regs, address))) + return; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ + if (unlikely(fault_in_kernel_space(address))) { + if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + } + + /* Can handle a stale RO->RW TLB: */ + if (spurious_fault(error_code, address)) + return; + + /* kprobes don't want to hook the spurious faults: */ + if (kprobes_fault(regs)) + return; + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock: + */ + bad_area_nosemaphore(regs, error_code, address, NULL); + + return; + } + + /* kprobes don't want to hook the spurious faults: */ + if (unlikely(kprobes_fault(regs))) + return; + + if (unlikely(error_code & X86_PF_RSVD)) + pgtable_bad(regs, error_code, address); + + if (unlikely(smap_violation(error_code, regs))) { + bad_area_nosemaphore(regs, error_code, address, NULL); + return; + } + + /* + * If we're in an interrupt, have no user context or are running + * in a region with pagefaults disabled then we must not take the fault + */ + if (unlikely(faulthandler_disabled() || !mm)) { + bad_area_nosemaphore(regs, error_code, address, NULL); + return; + } + + /* + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet: + */ + if (user_mode(regs)) { + local_irq_enable(); + error_code |= X86_PF_USER; + flags |= FAULT_FLAG_USER; + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + if (error_code & X86_PF_WRITE) + flags |= FAULT_FLAG_WRITE; + if (error_code & X86_PF_INSTR) + flags |= FAULT_FLAG_INSTRUCTION; + + /* + * When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: + */ + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (!(error_code & X86_PF_USER) && + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address, NULL); + return; + } +retry: + down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): + */ + might_sleep(); + } + + vma = find_vma(mm, address); + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) + goto good_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } + if (error_code & X86_PF_USER) { + /* + * Accessing the stack below %sp is always a bug. + * The large cushion allows instructions like enter + * and pusha to work. ("enter $65535, $31" pushes + * 32 pointers and then decrements %sp by 65535.) + */ + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } + } + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address, vma); + return; + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. + * + * Note that handle_userfault() may also release and reacquire mmap_sem + * (and not return with VM_FAULT_RETRY), when returning to userland to + * repeat the page fault later with a VM_FAULT_NOPAGE retval + * (potentially after handling any pending signal during the return to + * userland). The return to userland is identified whenever + * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. + * Thus we have to be careful about not touching vma after handling the + * fault, so we read the pkey beforehand. + */ + pkey = vma_pkey(vma); + fault = handle_mm_fault(vma, address, flags); + major |= fault & VM_FAULT_MAJOR; + + /* + * If we need to retry the mmap_sem has already been released, + * and if there is a fatal signal pending there is no guarantee + * that we made any progress. Handle this case first. + */ + if (unlikely(fault & VM_FAULT_RETRY)) { + /* Retry at most once */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + if (!fatal_signal_pending(tsk)) + goto retry; + } + + /* User mode? Just return to handle the fatal exception */ + if (flags & FAULT_FLAG_USER) + return; + + /* Not returning to user mode? Handle exceptions or die: */ + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + return; + } + + up_read(&mm->mmap_sem); + if (unlikely(fault & VM_FAULT_ERROR)) { + mm_fault_error(regs, error_code, address, &pkey, fault); + return; + } + + /* + * Major/minor page fault accounting. If any of the events + * returned VM_FAULT_MAJOR, we account it as a major fault. + */ + if (major) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); + } + + check_v8086_mode(regs, address, tsk); +} +NOKPROBE_SYMBOL(__do_page_fault); + +static nokprobe_inline void +trace_page_fault_entries(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + if (user_mode(regs)) + trace_page_fault_user(address, regs, error_code); + else + trace_page_fault_kernel(address, regs, error_code); +} + +/* + * We must have this function blacklisted from kprobes, tagged with notrace + * and call read_cr2() before calling anything else. To avoid calling any + * kind of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contains all sorts of tracepoints. + */ +dotraplinkage void notrace +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + unsigned long address = read_cr2(); /* Get the faulting address */ + enum ctx_state prev_state; + + prev_state = exception_enter(); + if (trace_pagefault_enabled()) + trace_page_fault_entries(address, regs, error_code); + + __do_page_fault(regs, error_code, address); + exception_exit(prev_state); +} +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c new file mode 100644 index 000000000..6d18b70ed --- /dev/null +++ b/arch/x86/mm/highmem_32.c @@ -0,0 +1,133 @@ +#include <linux/highmem.h> +#include <linux/export.h> +#include <linux/swap.h> /* for totalram_pages */ +#include <linux/bootmem.h> + +void *kmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_high(page); +} +EXPORT_SYMBOL(kmap); + +void kunmap(struct page *page) +{ + if (in_interrupt()) + BUG(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} +EXPORT_SYMBOL(kunmap); + +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap it is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + */ +void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + unsigned long vaddr; + int idx, type; + + preempt_disable(); + pagefault_disable(); + + if (!PageHighMem(page)) + return page_address(page); + + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + BUG_ON(!pte_none(*(kmap_pte-idx))); + set_pte(kmap_pte-idx, mk_pte(page, prot)); + arch_flush_lazy_mmu_mode(); + + return (void *)vaddr; +} +EXPORT_SYMBOL(kmap_atomic_prot); + +void *kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} +EXPORT_SYMBOL(kmap_atomic); + +/* + * This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn) +{ + return kmap_atomic_prot_pfn(pfn, kmap_prot); +} +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); + +void __kunmap_atomic(void *kvaddr) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. + */ + kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); + arch_flush_lazy_mmu_mode(); + } +#ifdef CONFIG_DEBUG_HIGHMEM + else { + BUG_ON(vaddr < PAGE_OFFSET); + BUG_ON(vaddr >= (unsigned long)high_memory); + } +#endif + + pagefault_enable(); + preempt_enable(); +} +EXPORT_SYMBOL(__kunmap_atomic); + +void __init set_highmem_pages_init(void) +{ + struct zone *zone; + int nid; + + /* + * Explicitly reset zone->managed_pages because set_highmem_pages_init() + * is invoked before free_all_bootmem() + */ + reset_all_zones_managed_pages(); + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + } +} diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c new file mode 100644 index 000000000..00b296617 --- /dev/null +++ b/arch/x86/mm/hugetlbpage.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IA-32 Huge TLB Page Support for Kernel. + * + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/sched/mm.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/err.h> +#include <linux/sysctl.h> +#include <linux/compat.h> +#include <asm/mman.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> +#include <asm/elf.h> +#include <asm/mpx.h> + +#if 0 /* This is just for testing */ +struct page * +follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +{ + unsigned long start = address; + int length = 1; + int nr; + struct page *page; + struct vm_area_struct *vma; + + vma = find_vma(mm, addr); + if (!vma || !is_vm_hugetlb_page(vma)) + return ERR_PTR(-EINVAL); + + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); + + /* hugetlb should be locked, and hence, prefaulted */ + WARN_ON(!pte || pte_none(*pte)); + + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; + + WARN_ON(!PageHead(page)); + + return page; +} + +int pmd_huge(pmd_t pmd) +{ + return 0; +} + +int pud_huge(pud_t pud) +{ + return 0; +} + +#else + +/* + * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. + */ +int pmd_huge(pmd_t pmd) +{ + return !pmd_none(pmd) && + (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; +} + +int pud_huge(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_PSE); +} +#endif + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.low_limit = get_mmap_base(1); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + info.high_limit = in_compat_syscall() ? + task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); + + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} + +static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE_LOW; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + if (len > TASK_SIZE) + return -ENOMEM; + + /* No address checking. See comment at mmap_address_hint_valid() */ + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr &= huge_page_mask(h); + if (!mmap_address_hint_valid(addr, len)) + goto get_unmapped_area; + + vma = find_vma(mm, addr); + if (!vma || addr + len <= vm_start_gap(vma)) + return addr; + } + +get_unmapped_area: + if (mm->get_unmapped_area == arch_get_unmapped_area) + return hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + return hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); +} +#endif /* CONFIG_HUGETLB_PAGE */ + +#ifdef CONFIG_X86_64 +static __init int setup_hugepagesz(char *opt) +{ + unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) { + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else { + hugetlb_bad_size(); + printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", + ps >> 20); + return 0; + } + return 1; +} +__setup("hugepagesz=", setup_hugepagesz); + +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +static __init int gigantic_pages_init(void) +{ + /* With compaction or CMA we can allocate gigantic pages at runtime */ + if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT)) + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + return 0; +} +arch_initcall(gigantic_pages_init); +#endif +#endif diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c new file mode 100644 index 000000000..968d7005f --- /dev/null +++ b/arch/x86/mm/ident_map.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper routines for building identity mapping page tables. This is + * included by both the compressed kernel and the regular kernel. + */ + +static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page, + unsigned long addr, unsigned long end) +{ + addr &= PMD_MASK; + for (; addr < end; addr += PMD_SIZE) { + pmd_t *pmd = pmd_page + pmd_index(addr); + + if (pmd_present(*pmd)) + continue; + + set_pmd(pmd, __pmd((addr - info->offset) | info->page_flag)); + } +} + +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + + next = (addr & PUD_MASK) + PUD_SIZE; + if (next > end) + next = end; + + if (info->direct_gbpages) { + pud_t pudval; + + if (pud_present(*pud)) + continue; + + addr &= PUD_MASK; + pudval = __pud((addr - info->offset) | info->page_flag); + set_pud(pud, pudval); + continue; + } + + if (pud_present(*pud)) { + pmd = pmd_offset(pud, 0); + ident_pmd_init(info, pmd, addr, next); + continue; + } + pmd = (pmd_t *)info->alloc_pgt_page(info->context); + if (!pmd) + return -ENOMEM; + ident_pmd_init(info, pmd, addr, next); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); + } + + return 0; +} + +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + int result; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + + result = ident_pud_init(info, pud, addr, next); + if (result) + return result; + + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); + } + + return 0; +} + +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, + unsigned long pstart, unsigned long pend) +{ + unsigned long addr = pstart + info->offset; + unsigned long end = pend + info->offset; + unsigned long next; + int result; + + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + + /* Filter out unsupported __PAGE_KERNEL_* bits: */ + info->kernpg_flag &= __default_kernel_pte_mask; + + for (; addr < end; addr = next) { + pgd_t *pgd = pgd_page + pgd_index(addr); + p4d_t *p4d; + + next = (addr & PGDIR_MASK) + PGDIR_SIZE; + if (next > end) + next = end; + + if (pgd_present(*pgd)) { + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); + if (result) + return result; + continue; + } + + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) + return -ENOMEM; + result = ident_p4d_init(info, p4d, addr, next); + if (result) + return result; + if (pgtable_l5_enabled()) { + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * The pgd entry has to point to the pud page table in this case. + */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); + } + } + + return 0; +} diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c new file mode 100644 index 000000000..b1dba0987 --- /dev/null +++ b/arch/x86/mm/init.c @@ -0,0 +1,953 @@ +#include <linux/gfp.h> +#include <linux/initrd.h> +#include <linux/ioport.h> +#include <linux/swap.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> /* for max_low_pfn */ +#include <linux/swapfile.h> +#include <linux/swapops.h> +#include <linux/kmemleak.h> + +#include <asm/set_memory.h> +#include <asm/e820/api.h> +#include <asm/init.h> +#include <asm/page.h> +#include <asm/page_types.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/proto.h> +#include <asm/dma.h> /* for MAX_DMA_PFN */ +#include <asm/microcode.h> +#include <asm/kaslr.h> +#include <asm/hypervisor.h> +#include <asm/cpufeature.h> +#include <asm/pti.h> + +/* + * We need to define the tracepoints somewhere, and tlb.c + * is only compied when SMP=y. + */ +#define CREATE_TRACE_POINTS +#include <trace/events/tlb.h> + +#include "mm_internal.h" + +/* + * Tables translating between page_cache_type_t and pte encoding. + * + * The default values are defined statically as minimal supported mode; + * WC and WT fall back to UC-. pat_init() updates these values to support + * more cache modes, WC and WT, when it is safe to do so. See pat_init() + * for the details. Note, __early_ioremap() used during early boot-time + * takes pgprot_t (pte encoding) and does not use these tables. + * + * Index into __cachemode2pte_tbl[] is the cachemode. + * + * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte + * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. + */ +uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { + [_PAGE_CACHE_MODE_WB ] = 0 | 0 , + [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, + [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, + [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, +}; +EXPORT_SYMBOL(__cachemode2pte_tbl); + +uint8_t __pte2cachemode_tbl[8] = { + [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, + [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, + [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, + [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, +}; +EXPORT_SYMBOL(__pte2cachemode_tbl); + +static unsigned long __initdata pgt_buf_start; +static unsigned long __initdata pgt_buf_end; +static unsigned long __initdata pgt_buf_top; + +static unsigned long min_pfn_mapped; + +static bool __initdata can_use_brk_pgt = true; + +/* + * Pages returned are already directly mapped. + * + * Changing that is likely to break Xen, see commit: + * + * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve + * + * for detailed information. + */ +__ref void *alloc_low_pages(unsigned int num) +{ + unsigned long pfn; + int i; + + if (after_bootmem) { + unsigned int order; + + order = get_order((unsigned long)num << PAGE_SHIFT); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); + } + + if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { + unsigned long ret = 0; + + if (min_pfn_mapped < max_pfn_mapped) { + ret = memblock_find_in_range( + min_pfn_mapped << PAGE_SHIFT, + max_pfn_mapped << PAGE_SHIFT, + PAGE_SIZE * num , PAGE_SIZE); + } + if (ret) + memblock_reserve(ret, PAGE_SIZE * num); + else if (can_use_brk_pgt) + ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); + + if (!ret) + panic("alloc_low_pages: can not alloc memory"); + + pfn = ret >> PAGE_SHIFT; + } else { + pfn = pgt_buf_end; + pgt_buf_end += num; + } + + for (i = 0; i < num; i++) { + void *adr; + + adr = __va((pfn + i) << PAGE_SHIFT); + clear_page(adr); + } + + return __va(pfn << PAGE_SHIFT); +} + +/* + * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. + * With KASLR memory randomization, depending on the machine e820 memory + * and the PUD alignment. We may need twice more pages when KASLR memory + * randomization is enabled. + */ +#ifndef CONFIG_RANDOMIZE_MEMORY +#define INIT_PGD_PAGE_COUNT 6 +#else +#define INIT_PGD_PAGE_COUNT 12 +#endif +#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); +void __init early_alloc_pgt_buf(void) +{ + unsigned long tables = INIT_PGT_BUF_SIZE; + phys_addr_t base; + + base = __pa(extend_brk(tables, PAGE_SIZE)); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); +} + +int after_bootmem; + +early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +static int page_size_mask; + +static void __init probe_page_size_mask(void) +{ + /* + * For pagealloc debugging, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) + page_size_mask |= 1 << PG_LEVEL_2M; + else + direct_gbpages = 0; + + /* Enable PSE if available */ + if (boot_cpu_has(X86_FEATURE_PSE)) + cr4_set_bits_and_update_boot(X86_CR4_PSE); + + /* Enable PGE if available */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (boot_cpu_has(X86_FEATURE_PGE)) { + cr4_set_bits_and_update_boot(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } + + /* By the default is everything supported: */ + __default_kernel_pte_mask = __supported_pte_mask; + /* Except when with PTI where the kernel is mostly non-Global: */ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + __default_kernel_pte_mask &= ~_PAGE_GLOBAL; + + /* Enable 1 GB linear kernel mappings if available: */ + if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { + printk(KERN_INFO "Using GB pages for direct mapping\n"); + page_size_mask |= 1 << PG_LEVEL_1G; + } else { + direct_gbpages = 0; + } +} + +static void setup_pcid(void) +{ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + + if (boot_cpu_has(X86_FEATURE_PGE)) { + /* + * This can't be cr4_set_bits_and_update_boot() -- the + * trampoline code can't handle CR4.PCIDE and it wouldn't + * do any good anyway. Despite the name, + * cr4_set_bits_and_update_boot() doesn't actually cause + * the bits in question to remain set all the way through + * the secondary boot asm. + * + * Instead, we brute-force it and set CR4.PCIDE manually in + * start_secondary(). + */ + cr4_set_bits(X86_CR4_PCIDE); + + /* + * INVPCID's single-context modes (2/3) only work if we set + * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable + * on systems that have X86_CR4_PCIDE clear, or that have + * no INVPCID support at all. + */ + if (boot_cpu_has(X86_FEATURE_INVPCID)) + setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't work if + * PCID is on but PGE is not. Since that combination + * doesn't exist on real hardware, there's no reason to try + * to fully support it, but it's polite to avoid corrupting + * data if we're on an improperly configured VM. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); + } +} + +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif + +static int __meminit save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<<PAGE_SHIFT; + mr[nr_range].end = end_pfn<<PAGE_SHIFT; + mr[nr_range].page_size_mask = page_size_mask; + nr_range++; + } + + return nr_range; +} + +/* + * adjust the page_size_mask for small range to go with + * big page size instead small one if nearby are ram too. + */ +static void __ref adjust_range_page_size_mask(struct map_range *mr, + int nr_range) +{ + int i; + + for (i = 0; i < nr_range; i++) { + if ((page_size_mask & (1<<PG_LEVEL_2M)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { + unsigned long start = round_down(mr[i].start, PMD_SIZE); + unsigned long end = round_up(mr[i].end, PMD_SIZE); + +#ifdef CONFIG_X86_32 + if ((end >> PAGE_SHIFT) > max_low_pfn) + continue; +#endif + + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_2M; + } + if ((page_size_mask & (1<<PG_LEVEL_1G)) && + !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { + unsigned long start = round_down(mr[i].start, PUD_SIZE); + unsigned long end = round_up(mr[i].end, PUD_SIZE); + + if (memblock_is_region_memory(start, end - start)) + mr[i].page_size_mask |= 1<<PG_LEVEL_1G; + } + } +} + +static const char *page_size_string(struct map_range *mr) +{ + static const char str_1g[] = "1G"; + static const char str_2m[] = "2M"; + static const char str_4m[] = "4M"; + static const char str_4k[] = "4k"; + + if (mr->page_size_mask & (1<<PG_LEVEL_1G)) + return str_1g; + /* + * 32-bit without PAE has a 4M large page size. + * PG_LEVEL_2M is misnamed, but we can at least + * print out the right size in the string. + */ + if (IS_ENABLED(CONFIG_X86_32) && + !IS_ENABLED(CONFIG_X86_PAE) && + mr->page_size_mask & (1<<PG_LEVEL_2M)) + return str_4m; + + if (mr->page_size_mask & (1<<PG_LEVEL_2M)) + return str_2m; + + return str_4k; +} + +static int __meminit split_mem_range(struct map_range *mr, int nr_range, + unsigned long start, + unsigned long end) +{ + unsigned long start_pfn, end_pfn, limit_pfn; + unsigned long pfn; + int i; + + limit_pfn = PFN_DOWN(end); + + /* head if not big page alignment ? */ + pfn = start_pfn = PFN_DOWN(start); +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pfn == 0) + end_pfn = PFN_DOWN(PMD_SIZE); + else + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); +#else /* CONFIG_X86_64 */ + end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); +#endif + if (end_pfn > limit_pfn) + end_pfn = limit_pfn; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pfn = end_pfn; + } + + /* big page (2M) range */ + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); +#ifdef CONFIG_X86_32 + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); +#else /* CONFIG_X86_64 */ + end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + pfn = end_pfn; + } + +#ifdef CONFIG_X86_64 + /* big page (1G) range */ + start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); + pfn = end_pfn; + } + + /* tail is not big page (1G) alignment */ + start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); + end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + pfn = end_pfn; + } +#endif + + /* tail is not big page (2M) alignment */ + start_pfn = pfn; + end_pfn = limit_pfn; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + if (!after_bootmem) + adjust_range_page_size_mask(mr, nr_range); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; + } + + for (i = 0; i < nr_range; i++) + pr_debug(" [mem %#010lx-%#010lx] page %s\n", + mr[i].start, mr[i].end - 1, + page_size_string(&mr[i])); + + return nr_range; +} + +struct range pfn_mapped[E820_MAX_ENTRIES]; +int nr_pfn_mapped; + +static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES, + nr_pfn_mapped, start_pfn, end_pfn); + nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES); + + max_pfn_mapped = max(max_pfn_mapped, end_pfn); + + if (start_pfn < (1UL<<(32-PAGE_SHIFT))) + max_low_pfn_mapped = max(max_low_pfn_mapped, + min(end_pfn, 1UL<<(32-PAGE_SHIFT))); +} + +bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) +{ + int i; + + for (i = 0; i < nr_pfn_mapped; i++) + if ((start_pfn >= pfn_mapped[i].start) && + (end_pfn <= pfn_mapped[i].end)) + return true; + + return false; +} + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __ref init_memory_mapping(unsigned long start, + unsigned long end) +{ + struct map_range mr[NR_RANGE_MR]; + unsigned long ret = 0; + int nr_range, i; + + pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", + start, end - 1); + + memset(mr, 0, sizeof(mr)); + nr_range = split_mem_range(mr, 0, start, end); + + for (i = 0; i < nr_range; i++) + ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, + mr[i].page_size_mask); + + add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); + + return ret >> PAGE_SHIFT; +} + +/* + * We need to iterate through the E820 memory map and create direct mappings + * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply + * create direct mappings for all pfns from [0 to max_low_pfn) and + * [4GB to max_pfn) because of possible memory holes in high addresses + * that cannot be marked as UC by fixed/variable range MTRRs. + * Depending on the alignment of E820 ranges, this may possibly result + * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. + * + * init_mem_mapping() calls init_range_memory_mapping() with big range. + * That range would have hole in the middle or ends, and only ram parts + * will be mapped in init_range_memory_mapping(). + */ +static unsigned long __init init_range_memory_mapping( + unsigned long r_start, + unsigned long r_end) +{ + unsigned long start_pfn, end_pfn; + unsigned long mapped_ram_size = 0; + int i; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); + u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); + if (start >= end) + continue; + + /* + * if it is overlapping with brk pgt, we need to + * alloc pgt buf from memblock instead. + */ + can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= + min(end, (u64)pgt_buf_top<<PAGE_SHIFT); + init_memory_mapping(start, end); + mapped_ram_size += end - start; + can_use_brk_pgt = true; + } + + return mapped_ram_size; +} + +static unsigned long __init get_new_step_size(unsigned long step_size) +{ + /* + * Initial mapped size is PMD_SIZE (2M). + * We can not set step_size to be PUD_SIZE (1G) yet. + * In worse case, when we cross the 1G boundary, and + * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) + * to map 1G range with PTE. Hence we use one less than the + * difference of page table level shifts. + * + * Don't need to worry about overflow in the top-down case, on 32bit, + * when step_size is 0, round_down() returns 0 for start, and that + * turns it into 0x100000000ULL. + * In the bottom-up case, round_up(x, 0) returns 0 though too, which + * needs to be taken into consideration by the code below. + */ + return step_size << (PMD_SHIFT - PAGE_SHIFT - 1); +} + +/** + * memory_map_top_down - Map [map_start, map_end) top down + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in top-down. That said, the page tables + * will be allocated at the end of the memory, and we map the + * memory in top-down. + */ +static void __init memory_map_top_down(unsigned long map_start, + unsigned long map_end) +{ + unsigned long real_end, start, last_start; + unsigned long step_size; + unsigned long addr; + unsigned long mapped_ram_size = 0; + + /* xen has big range in reserved near end of ram, skip it at first.*/ + addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); + real_end = addr + PMD_SIZE; + + /* step_size need to be small so pgt_buf from BRK could cover it */ + step_size = PMD_SIZE; + max_pfn_mapped = 0; /* will get exact value next */ + min_pfn_mapped = real_end >> PAGE_SHIFT; + last_start = start = real_end; + + /* + * We start from the top (end of memory) and go to the bottom. + * The memblock_find_in_range() gets us a block of RAM from the + * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages + * for page table. + */ + while (last_start > map_start) { + if (last_start > step_size) { + start = round_down(last_start - 1, step_size); + if (start < map_start) + start = map_start; + } else + start = map_start; + mapped_ram_size += init_range_memory_mapping(start, + last_start); + last_start = start; + min_pfn_mapped = last_start >> PAGE_SHIFT; + if (mapped_ram_size >= step_size) + step_size = get_new_step_size(step_size); + } + + if (real_end < map_end) + init_range_memory_mapping(real_end, map_end); +} + +/** + * memory_map_bottom_up - Map [map_start, map_end) bottom up + * @map_start: start address of the target memory range + * @map_end: end address of the target memory range + * + * This function will setup direct mapping for memory range + * [map_start, map_end) in bottom-up. Since we have limited the + * bottom-up allocation above the kernel, the page tables will + * be allocated just above the kernel and we map the memory + * in [map_start, map_end) in bottom-up. + */ +static void __init memory_map_bottom_up(unsigned long map_start, + unsigned long map_end) +{ + unsigned long next, start; + unsigned long mapped_ram_size = 0; + /* step_size need to be small so pgt_buf from BRK could cover it */ + unsigned long step_size = PMD_SIZE; + + start = map_start; + min_pfn_mapped = start >> PAGE_SHIFT; + + /* + * We start from the bottom (@map_start) and go to the top (@map_end). + * The memblock_find_in_range() gets us a block of RAM from the + * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages + * for page table. + */ + while (start < map_end) { + if (step_size && map_end - start > step_size) { + next = round_up(start + 1, step_size); + if (next > map_end) + next = map_end; + } else { + next = map_end; + } + + mapped_ram_size += init_range_memory_mapping(start, next); + start = next; + + if (mapped_ram_size >= step_size) + step_size = get_new_step_size(step_size); + } +} + +void __init init_mem_mapping(void) +{ + unsigned long end; + + pti_check_boottime_disable(); + probe_page_size_mask(); + setup_pcid(); + +#ifdef CONFIG_X86_64 + end = max_pfn << PAGE_SHIFT; +#else + end = max_low_pfn << PAGE_SHIFT; +#endif + + /* the ISA range is always mapped regardless of memory holes */ + init_memory_mapping(0, ISA_END_ADDRESS); + + /* Init the trampoline, possibly with KASLR memory offset */ + init_trampoline(); + + /* + * If the allocation is in bottom-up direction, we setup direct mapping + * in bottom-up, otherwise we setup direct mapping in top-down. + */ + if (memblock_bottom_up()) { + unsigned long kernel_end = __pa_symbol(_end); + + /* + * we need two separate calls here. This is because we want to + * allocate page tables above the kernel. So we first map + * [kernel_end, end) to make memory above the kernel be mapped + * as soon as possible. And then use page tables allocated above + * the kernel to map [ISA_END_ADDRESS, kernel_end). + */ + memory_map_bottom_up(kernel_end, end); + memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); + } else { + memory_map_top_down(ISA_END_ADDRESS, end); + } + +#ifdef CONFIG_X86_64 + if (max_pfn > max_low_pfn) { + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } +#else + early_ioremap_page_table_range_init(); +#endif + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); + + x86_init.hyper.init_mem_mapping(); + + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); +} + +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * On x86, access has to be given to the first megabyte of RAM because that + * area traditionally contains BIOS code and data regions used by X, dosemu, + * and similar apps. Since they map the entire memory range, the whole range + * must be allowed (for mapping), but any areas that would otherwise be + * disallowed are flagged as being "zero filled" instead of rejected. + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) + != REGION_DISJOINT) { + /* + * For disallowed memory regions in the low 1MB range, + * request that the page be shown as all zeros. + */ + if (pagenr < 256) + return 2; + + return 0; + } + + /* + * This must follow RAM test, since System RAM is considered a + * restricted resource under CONFIG_STRICT_IOMEM. + */ + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { + /* Low 1MB bypasses iomem restrictions. */ + if (pagenr < 256) + return 1; + + return 0; + } + + return 1; +} + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long begin_aligned, end_aligned; + + /* Make sure boundaries are page aligned */ + begin_aligned = PAGE_ALIGN(begin); + end_aligned = end & PAGE_MASK; + + if (WARN_ON(begin_aligned != begin || end_aligned != end)) { + begin = begin_aligned; + end = end_aligned; + } + + if (begin >= end) + return; + + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ + if (debug_pagealloc_enabled()) { + pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", + begin, end - 1); + /* + * Inform kmemleak about the hole in the memory since the + * corresponding pages will be unmapped. + */ + kmemleak_free_part((void *)begin, end - begin); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); + } else { + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable and non-executable first. + */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + + free_reserved_area((void *)begin, (void *)end, + POISON_FREE_INITMEM, what); + } +} + +/* + * begin/end can be in the direct map or the "high kernel mapping" + * used for the kernel image only. free_init_pages() will do the + * right thing for either kind of address. + */ +void free_kernel_image_pages(void *begin, void *end) +{ + unsigned long begin_ul = (unsigned long)begin; + unsigned long end_ul = (unsigned long)end; + unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; + + + free_init_pages("unused kernel image", begin_ul, end_ul); + + /* + * PTI maps some of the kernel into userspace. For performance, + * this includes some kernel areas that do not contain secrets. + * Those areas might be adjacent to the parts of the kernel image + * being freed, which may contain secrets. Remove the "high kernel + * image mapping" for these freed areas, ensuring they are not even + * potentially vulnerable to Meltdown regardless of the specific + * optimizations PTI is currently using. + * + * The "noalias" prevents unmapping the direct map alias which is + * needed to access the freed pages. + * + * This is only valid for 64bit kernels. 32bit has only one mapping + * which can't be treated in this way for obvious reasons. + */ + if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) + set_memory_np_noalias(begin_ul, len_pages); +} + +void __weak mem_encrypt_free_decrypted_mem(void) { } + +void __ref free_initmem(void) +{ + e820__reallocate_tables(); + + mem_encrypt_free_decrypted_mem(); + + free_kernel_image_pages(&__init_begin, &__init_end); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void __init free_initrd_mem(unsigned long start, unsigned long end) +{ + /* + * end could be not aligned, and We can not align that, + * decompresser could be confused by aligned initrd_end + * We already reserve the end partial page before in + * - i386_start_kernel() + * - x86_64_start_kernel() + * - relocate_initrd() + * So here We can do PAGE_ALIGN() safely to get partial page to be freed + */ + free_init_pages("initrd", start, PAGE_ALIGN(end)); +} +#endif + +/* + * Calculate the precise size of the DMA zone (first 16 MB of RAM), + * and pass it to the MM layer - to help it set zone watermarks more + * accurately. + * + * Done on 64-bit systems only for the time being, although 32-bit systems + * might benefit from this as well. + */ +void __init memblock_find_dma_reserve(void) +{ +#ifdef CONFIG_X86_64 + u64 nr_pages = 0, nr_free_pages = 0; + unsigned long start_pfn, end_pfn; + phys_addr_t start_addr, end_addr; + int i; + u64 u; + + /* + * Iterate over all memory ranges (free and reserved ones alike), + * to calculate the total number of pages in the first 16 MB of RAM: + */ + nr_pages = 0; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min(start_pfn, MAX_DMA_PFN); + end_pfn = min(end_pfn, MAX_DMA_PFN); + + nr_pages += end_pfn - start_pfn; + } + + /* + * Iterate over free memory ranges to calculate the number of free + * pages in the DMA zone, while not counting potential partial + * pages at the beginning or the end of the range: + */ + nr_free_pages = 0; + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { + start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN); + end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN); + + if (start_pfn < end_pfn) + nr_free_pages += end_pfn - start_pfn; + } + + set_dma_reserve(nr_pages - nr_free_pages); +#endif +} + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif + + free_area_init_nodes(max_zone_pfns); +} + +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { + .loaded_mm = &init_mm, + .next_asid = 1, + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ +}; +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); + +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) +{ + /* entry 0 MUST be WB (hardwired to speed up translations) */ + BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); + + __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); + __pte2cachemode_tbl[entry] = cache; +} + +#ifdef CONFIG_SWAP +unsigned long max_swapfile_size(void) +{ + unsigned long pages; + + pages = generic_max_swapfile_size(); + + if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ + unsigned long long l1tf_limit = l1tf_pfn_limit(); + /* + * We encode swap offsets also with 3 bits below those for pfn + * which makes the usable limit higher. + */ +#if CONFIG_PGTABLE_LEVELS > 2 + l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; +#endif + pages = min_t(unsigned long long, l1tf_limit, pages); + } + return pages; +} +#endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c new file mode 100644 index 000000000..79b95910f --- /dev/null +++ b/arch/x86/mm/init_32.c @@ -0,0 +1,956 @@ +/* + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/pci.h> +#include <linux/pfn.h> +#include <linux/poison.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/proc_fs.h> +#include <linux/memory_hotplug.h> +#include <linux/initrd.h> +#include <linux/cpumask.h> +#include <linux/gfp.h> + +#include <asm/asm.h> +#include <asm/bios_ebda.h> +#include <asm/processor.h> +#include <linux/uaccess.h> +#include <asm/pgtable.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820/api.h> +#include <asm/apic.h> +#include <asm/bugs.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/olpc_ofw.h> +#include <asm/pgalloc.h> +#include <asm/sections.h> +#include <asm/paravirt.h> +#include <asm/setup.h> +#include <asm/set_memory.h> +#include <asm/page_types.h> +#include <asm/cpu_entry_area.h> +#include <asm/init.h> + +#include "mm_internal.h" + +unsigned long highstart_pfn, highend_pfn; + +bool __read_mostly __vmalloc_start_set = false; + +/* + * Creates a middle page table and puts a pointer to it in the + * given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t * __init one_md_table_init(pgd_t *pgd) +{ + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { + pmd_table = (pmd_t *)alloc_low_page(); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); + BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; + } +#endif + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); + pmd_table = pmd_offset(pud, 0); + + return pmd_table; +} + +/* + * Create a page table and place a pointer to it in a middle page + * directory entry: + */ +static pte_t * __init one_page_table_init(pmd_t *pmd) +{ + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { + pte_t *page_table = (pte_t *)alloc_low_page(); + + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); + } + + return pte_offset_kernel(pmd, 0); +} + +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; +} + +static unsigned long __init +page_table_range_init_count(unsigned long start, unsigned long end) +{ + unsigned long count = 0; +#ifdef CONFIG_HIGHMEM + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + int pgd_idx, pmd_idx; + unsigned long vaddr; + + if (pmd_idx_kmap_begin == pmd_idx_kmap_end) + return 0; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd_idx++) { + if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && + (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) + count++; + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +#endif + return count; +} + +static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, + unsigned long vaddr, pte_t *lastpte, + void **adr) +{ +#ifdef CONFIG_HIGHMEM + /* + * Something (early fixmap) may already have put a pte + * page here, which causes the page table allocation + * to become nonlinear. Attempt to fix it, and if it + * is still nonlinear then we have to bug. + */ + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + + if (pmd_idx_kmap_begin != pmd_idx_kmap_end + && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) { + pte_t *newpte; + int i; + + BUG_ON(after_bootmem); + newpte = *adr; + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(newpte + i, pte[i]); + *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE); + + paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); + BUG_ON(newpte != pte_offset_kernel(pmd, 0)); + __flush_tlb_all(); + + paravirt_release_pte(__pa(pte) >> PAGE_SHIFT); + pte = newpte; + } + BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) + && vaddr > fix_to_virt(FIX_KMAP_END) + && lastpte && lastpte + PTRS_PER_PTE != pte); +#endif + return pte; +} + +/* + * This function initializes a certain range of kernel virtual memory + * with new bootmem page tables, everywhere page tables are missing in + * the given range. + * + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without + * checking the pgd every time. + */ +static void __init +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) +{ + int pgd_idx, pmd_idx; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte = NULL; + unsigned long count = page_table_range_init_count(start, end); + void *adr = NULL; + + if (count) + adr = alloc_low_pages(count); + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { + pmd = one_md_table_init(pgd); + pmd = pmd + pmd_index(vaddr); + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd++, pmd_idx++) { + pte = page_table_kmap_check(one_page_table_init(pmd), + pmd, vaddr, pte, &adr); + + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +} + +static inline int is_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) + return 1; + return 0; +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: + */ +unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) +{ + int use_pse = page_size_mask == (1<<PG_LEVEL_2M); + unsigned long last_map_addr = end; + unsigned long start_pfn, end_pfn; + pgd_t *pgd_base = swapper_pg_dir; + int pgd_idx, pmd_idx, pte_ofs; + unsigned long pfn; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned pages_2m, pages_4k; + int mapping_iter; + + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + */ + mapping_iter = 1; + + if (!boot_cpu_has(X86_FEATURE_PSE)) + use_pse = 0; + +repeat: + pages_2m = pages_4k = 0; + pfn = start_pfn; + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pgd = pgd_base + pgd_idx; + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { + pmd = one_md_table_init(pgd); + + if (pfn >= end_pfn) + continue; +#ifdef CONFIG_X86_PAE + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pmd += pmd_idx; +#else + pmd_idx = 0; +#endif + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn; + pmd++, pmd_idx++) { + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; + + /* + * Map with big pages if possible, otherwise + * create normal page tables: + */ + if (use_pse) { + unsigned int addr2; + pgprot_t prot = PAGE_KERNEL_LARGE; + /* + * first pass will use the same initial + * identity mapping attribute + _PAGE_PSE. + */ + pgprot_t init_prot = + __pgprot(PTE_IDENT_ATTR | + _PAGE_PSE); + + pfn &= PMD_MASK >> PAGE_SHIFT; + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(addr) || + is_kernel_text(addr2)) + prot = PAGE_KERNEL_LARGE_EXEC; + + pages_2m++; + if (mapping_iter == 1) + set_pmd(pmd, pfn_pmd(pfn, init_prot)); + else + set_pmd(pmd, pfn_pmd(pfn, prot)); + + pfn += PTRS_PER_PTE; + continue; + } + pte = one_page_table_init(pmd); + + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pte += pte_ofs; + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { + pgprot_t prot = PAGE_KERNEL; + /* + * first pass will use the same initial + * identity mapping attribute. + */ + pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); + + if (is_kernel_text(addr)) + prot = PAGE_KERNEL_EXEC; + + pages_4k++; + if (mapping_iter == 1) { + set_pte(pte, pfn_pte(pfn, init_prot)); + last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE; + } else + set_pte(pte, pfn_pte(pfn, prot)); + } + } + } + if (mapping_iter == 1) { + /* + * update direct mapping page count only in the first + * iteration. + */ + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); + + /* + * local global flush tlb, which will flush the previous + * mappings present in both small and large page TLB's. + */ + __flush_tlb_all(); + + /* + * Second iteration will set the actual desired PTE attributes. + */ + mapping_iter = 2; + goto repeat; + } + return last_map_addr; +} + +pte_t *kmap_pte; + +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) +{ + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d = p4d_offset(pgd, vaddr); + pud_t *pud = pud_offset(p4d, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + return pte_offset_kernel(pmd, vaddr); +} + +static void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* + * Cache the first kmap pte: + */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); +} + +#ifdef CONFIG_HIGHMEM +static void __init permanent_kmaps_init(pgd_t *pgd_base) +{ + unsigned long vaddr; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + vaddr = PKMAP_BASE; + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + pmd = pmd_offset(pud, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; +} + +void __init add_highpages_with_active_regions(int nid, + unsigned long start_pfn, unsigned long end_pfn) +{ + phys_addr_t start, end; + u64 i; + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { + unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), + start_pfn, end_pfn); + unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), + start_pfn, end_pfn); + for ( ; pfn < e_pfn; pfn++) + if (pfn_valid(pfn)) + free_highmem_page(pfn_to_page(pfn)); + } +} +#else +static inline void permanent_kmaps_init(pgd_t *pgd_base) +{ +} +#endif /* CONFIG_HIGHMEM */ + +void __init sync_initial_page_table(void) +{ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. + */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +} + +void __init native_pagetable_init(void) +{ + unsigned long pfn, va; + pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* + * Remove any mappings which extend past the end of physical + * memory from the boot time page table. + * In virtual address space, we should have at least two pages + * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END + * definition. And max_low_pfn is set to VMALLOC_END physical + * address. If initial memory mapping is doing right job, we + * should have pte used near max_low_pfn or one pmd is not present. + */ + for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); + pgd = base + pgd_index(va); + if (!pgd_present(*pgd)) + break; + + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); + pmd = pmd_offset(pud, va); + if (!pmd_present(*pmd)) + break; + + /* should not be large page here */ + if (pmd_large(*pmd)) { + pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n", + pfn, pmd, __pa(pmd)); + BUG_ON(1); + } + + pte = pte_offset_kernel(pmd, va); + if (!pte_present(*pte)) + break; + + printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n", + pfn, pmd, __pa(pmd), pte, __pa(pte)); + pte_clear(NULL, va, pte); + } + paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); + paging_init(); +} + +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * If we're booting on native hardware, this will be a pagetable + * constructed in arch/x86/kernel/head_32.S. The root of the + * pagetable will be swapper_pg_dir. + * + * If we're booting paravirtualized under a hypervisor, then there are + * more options: we may already be running PAE, and the pagetable may + * or may not be based in swapper_pg_dir. In any case, + * paravirt_pagetable_init() will set up swapper_pg_dir + * appropriately for the rest of the initialization to work. + * + * In general, pagetable_init() assumes that the pagetable may already + * be partially populated, and so it avoids stomping on any existing + * mappings. + */ +void __init early_ioremap_page_table_range_init(void) +{ + pgd_t *pgd_base = swapper_pg_dir; + unsigned long vaddr, end; + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; + page_table_range_init(vaddr, end, pgd_base); + early_ioremap_reset(); +} + +static void __init pagetable_init(void) +{ + pgd_t *pgd_base = swapper_pg_dir; + + permanent_kmaps_init(pgd_base); +} + +#define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) +/* Bits supported by the hardware: */ +pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; +/* Bits allowed in normal kernel mappings: */ +pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; +EXPORT_SYMBOL_GPL(__supported_pte_mask); +/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ +EXPORT_SYMBOL(__default_kernel_pte_mask); + +/* user-defined highmem size */ +static unsigned int highmem_pages = -1; + +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; + + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); + +#define MSG_HIGHMEM_TOO_BIG \ + "highmem size (%luMB) is bigger than pages available (%luMB)!\n" + +#define MSG_LOWMEM_TOO_SMALL \ + "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" +/* + * All of RAM fits into lowmem - but if user wants highmem + * artificially via the highmem=x boot parameter then create + * it: + */ +static void __init lowmem_pfn_init(void) +{ + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, + pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { + printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif +} + +#define MSG_HIGHMEM_TOO_SMALL \ + "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" + +#define MSG_HIGHMEM_TRIMMED \ + "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" +/* + * We have more RAM than fits into lowmem - we try to put it into + * highmem, also taking the highmem=x boot parameter into account: + */ +static void __init highmem_pfn_init(void) +{ + max_low_pfn = MAXMEM_PFN; + + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_HIGHMEM64G + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); + } +#endif /* !CONFIG_HIGHMEM64G */ +#endif /* !CONFIG_HIGHMEM */ +} + +/* + * Determine low and high memory ranges: + */ +void __init find_low_pfn_range(void) +{ + /* it could update max_pfn */ + + if (max_pfn <= MAXMEM_PFN) + lowmem_pfn_init(); + else + highmem_pfn_init(); +} + +#ifndef CONFIG_NEED_MULTIPLE_NODES +void __init initmem_init(void) +{ +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + + memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); + sparse_memory_present_with_active_regions(0); + +#ifdef CONFIG_FLATMEM + max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn; +#endif + __vmalloc_start_set = true; + + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); +} +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +void __init setup_bootmem_allocator(void) +{ + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); + printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); +} + +/* + * paging_init() sets up the page tables - note that the first 8MB are + * already mapped by head.S. + * + * This routines also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. + */ +void __init paging_init(void) +{ + pagetable_init(); + + __flush_tlb_all(); + + kmap_init(); + + /* + * NOTE: at this point the bootmem allocator is fully available. + */ + olpc_dt_build_devicetree(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); + zone_sizes_init(); +} + +/* + * Test if the WP bit works in supervisor mode. It isn't supported on 386's + * and also on some strange 486's. All 586+'s are OK. This used to involve + * black magic jumps to work around some nasty CPU bugs, but fortunately the + * switch to using exceptions got rid of all that. + */ +static void __init test_wp_bit(void) +{ + char z = 0; + + printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode..."); + + __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO); + + if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) { + clear_fixmap(FIX_WP_TEST); + printk(KERN_CONT "Ok.\n"); + return; + } + + printk(KERN_CONT "No.\n"); + panic("Linux doesn't support CPUs with broken WP."); +} + +void __init mem_init(void) +{ + pci_iommu_alloc(); + +#ifdef CONFIG_FLATMEM + BUG_ON(!mem_map); +#endif + /* + * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to + * be done before free_all_bootmem(). Memblock use free low memory for + * temporary data (see find_range_array()) and for this purpose can use + * pages that was already passed to the buddy allocator, hence marked as + * not accessible in the page tables when compiled with + * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not + * important here. + */ + set_highmem_pages_init(); + + /* this will put all low memory onto the freelists */ + free_all_bootmem(); + + after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); + + mem_init_print_info(NULL); + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n" +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + + CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE, + CPU_ENTRY_AREA_MAP_SIZE >> 10, + +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +#endif + + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - + (unsigned long)&__init_begin) >> 10, + + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. + */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START >= VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); + + test_wp_bit(); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); +} + +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); +} +#endif + +int kernel_set_to_readonly __read_mostly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, start+size); + + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, start+size); + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +} + +static void mark_nxdata_nx(void) +{ + /* + * When this called, init has already been executed and released, + * so everything past _etext should be NX. + */ + unsigned long start = PFN_ALIGN(_etext); + /* + * This comes from is_kernel_text upper limit. Also HPAGE where used: + */ + unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; + + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); + set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + + kernel_set_to_readonly = 1; + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); +#endif + + start += size; + size = (unsigned long)__end_rodata - start; + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +#endif + mark_nxdata_nx(); + if (__supported_pte_mask & _PAGE_NX) + debug_checkwx(); +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c new file mode 100644 index 000000000..4b25a1ad1 --- /dev/null +++ b/arch/x86/mm/init_64.c @@ -0,0 +1,1544 @@ +/* + * linux/arch/x86_64/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/initrd.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/proc_fs.h> +#include <linux/pci.h> +#include <linux/pfn.h> +#include <linux/poison.h> +#include <linux/dma-mapping.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/memremap.h> +#include <linux/nmi.h> +#include <linux/gfp.h> +#include <linux/kcore.h> + +#include <asm/processor.h> +#include <asm/bios_ebda.h> +#include <linux/uaccess.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820/api.h> +#include <asm/apic.h> +#include <asm/tlb.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> +#include <asm/smp.h> +#include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/numa.h> +#include <asm/set_memory.h> +#include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> + +#include "mm_internal.h" + +#include "ident_map.c" + +/* + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the + * physical space so we can cache the place of the first one and move + * around without checking the pgd every time. + */ + +/* Bits supported by the hardware: */ +pteval_t __supported_pte_mask __read_mostly = ~0; +/* Bits allowed in normal kernel mappings: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; +EXPORT_SYMBOL_GPL(__supported_pte_mask); +/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ +EXPORT_SYMBOL(__default_kernel_pte_mask); + +int force_personality32; + +/* + * noexec32=on|off + * Control non executable heap for 32bit processes. + * To control the stack too use noexec=off + * + * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) + * off PROT_READ implies PROT_EXEC + */ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +static void sync_global_pgds_l5(unsigned long start, unsigned long end) +{ + unsigned long addr; + + for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + const pgd_t *pgd_ref = pgd_offset_k(addr); + struct page *page; + + /* Check for overflow */ + if (addr < start) + break; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(addr); + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + + spin_unlock(pgt_lock); + } + spin_unlock(&pgd_lock); + } +} + +static void sync_global_pgds_l4(unsigned long start, unsigned long end) +{ + unsigned long addr; + + for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd_ref = pgd_offset_k(addr); + const p4d_t *p4d_ref; + struct page *page; + + /* + * With folded p4d, pgd_none() is always false, we need to + * handle synchonization on p4d level. + */ + MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); + p4d_ref = p4d_offset(pgd_ref, addr); + + if (p4d_none(*p4d_ref)) + continue; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + p4d_t *p4d; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(addr); + p4d = p4d_offset(pgd, addr); + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (!p4d_none(*p4d_ref) && !p4d_none(*p4d)) + BUG_ON(p4d_page_vaddr(*p4d) + != p4d_page_vaddr(*p4d_ref)); + + if (p4d_none(*p4d)) + set_p4d(p4d, *p4d_ref); + + spin_unlock(pgt_lock); + } + spin_unlock(&pgd_lock); + } +} + +/* + * When memory was added make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + if (pgtable_l5_enabled()) + sync_global_pgds_l5(start, end); + else + sync_global_pgds_l4(start, end); +} + +/* + * NOTE: This function is marked __ref because it calls __init function + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. + */ +static __ref void *spp_getpage(void) +{ + void *ptr; + + if (after_bootmem) + ptr = (void *) get_zeroed_page(GFP_ATOMIC); + else + ptr = alloc_bootmem_pages(PAGE_SIZE); + + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { + panic("set_pte_phys: cannot allocate page data %s\n", + after_bootmem ? "after bootmem" : ""); + } + + pr_debug("spp_getpage %p\n", ptr); + + return ptr; +} + +static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr) +{ + if (pgd_none(*pgd)) { + p4d_t *p4d = (p4d_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, p4d); + if (p4d != p4d_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + p4d, p4d_offset(pgd, 0)); + } + return p4d_offset(pgd, vaddr); +} + +static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr) +{ + if (p4d_none(*p4d)) { + pud_t *pud = (pud_t *)spp_getpage(); + p4d_populate(&init_mm, p4d, pud); + if (pud != pud_offset(p4d, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pud, pud_offset(p4d, 0)); + } + return pud_offset(p4d, vaddr); +} + +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ + if (pud_none(*pud)) { + pmd_t *pmd = (pmd_t *) spp_getpage(); + pud_populate(&init_mm, pud, pmd); + if (pmd != pmd_offset(pud, 0)) + printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); + } + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ + if (pmd_none(*pmd)) { + pte_t *pte = (pte_t *) spp_getpage(); + pmd_populate_kernel(&init_mm, pmd, pte); + if (pte != pte_offset_kernel(pmd, 0)) + printk(KERN_ERR "PAGETABLE BUG #03!\n"); + } + return pte_offset_kernel(pmd, vaddr); +} + +static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) +{ + pmd_t *pmd = fill_pmd(pud, vaddr); + pte_t *pte = fill_pte(pmd, vaddr); + + set_pte(pte, new_pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one_kernel(vaddr); +} + +void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) +{ + p4d_t *p4d = p4d_page + p4d_index(vaddr); + pud_t *pud = fill_pud(p4d, vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud = pud_page + pud_index(vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ + pgd_t *pgd; + p4d_t *p4d_page; + + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + + p4d_page = p4d_offset(pgd, 0); + set_pte_vaddr_p4d(p4d_page, vaddr, pteval); +} + +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + p4d = fill_p4d(pgd, vaddr); + pud = fill_pud(p4d, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); +} + +/* + * Create large page table mappings for a range of physical addresses. + */ +static void __init __init_extra_mapping(unsigned long phys, unsigned long size, + enum page_cache_mode cache) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pgprot_t prot; + + pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | + pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { + pgd = pgd_offset_k((unsigned long)__va(phys)); + if (pgd_none(*pgd)) { + p4d = (p4d_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE | + _PAGE_USER)); + } + p4d = p4d_offset(pgd, (unsigned long)__va(phys)); + if (p4d_none(*p4d)) { + pud = (pud_t *) spp_getpage(); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE | + _PAGE_USER)); + } + pud = pud_offset(p4d, (unsigned long)__va(phys)); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | + _PAGE_USER)); + } + pmd = pmd_offset(pud, phys); + BUG_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(phys | pgprot_val(prot))); + } +} + +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB); +} + +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC); +} + +/* + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) + * + * phys_base holds the negative offset to the kernel, which is added + * to the compile time generated pmds. This results in invalid pmds up + * to the point where we hit the physaddr 0 mapping. + * + * We limit the mappings to the region from _text to _brk_end. _brk_end + * is rounded up to the 2MB boundary. This catches the invalid pmds as + * well, as they are located before _text: + */ +void __init cleanup_highmap(void) +{ + unsigned long vaddr = __START_KERNEL_map; + unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE; + unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt; + + /* + * Native path, max_pfn_mapped is not set yet. + * Xen has valid max_pfn_mapped set in + * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable(). + */ + if (max_pfn_mapped) + vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); + } +} + +/* + * Create PTE level page table mapping for physical addresses. + * It returns the last physical address mapped. + */ +static unsigned long __meminit +phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end, + pgprot_t prot) +{ + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + pte_t *pte; + int i; + + pte = pte_page + pte_index(paddr); + i = pte_index(paddr); + + for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) { + paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE; + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & PAGE_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PAGE_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_pte(pte, __pte(0)); + continue; + } + + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ + if (!pte_none(*pte)) { + if (!after_bootmem) + pages++; + continue; + } + + if (0) + pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr, + pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte); + pages++; + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE; + } + + update_page_count(PG_LEVEL_4K, pages); + + return paddr_last; +} + +/* + * Create PMD level page table mapping for physical addresses. The virtual + * and physical address have to be aligned at this level. + * It returns the last physical address mapped. + */ +static unsigned long __meminit +phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask, pgprot_t prot) +{ + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + + int i = pmd_index(paddr); + + for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) { + pmd_t *pmd = pmd_page + pmd_index(paddr); + pte_t *pte; + pgprot_t new_prot = prot; + + paddr_next = (paddr & PMD_MASK) + PMD_SIZE; + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & PMD_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PMD_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_pmd(pmd, __pmd(0)); + continue; + } + + if (!pmd_none(*pmd)) { + if (!pmd_large(*pmd)) { + spin_lock(&init_mm.page_table_lock); + pte = (pte_t *)pmd_page_vaddr(*pmd); + paddr_last = phys_pte_init(pte, paddr, + paddr_end, prot); + spin_unlock(&init_mm.page_table_lock); + continue; + } + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_2M)) { + if (!after_bootmem) + pages++; + paddr_last = paddr_next; + continue; + } + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); + } + + if (page_size_mask & (1<<PG_LEVEL_2M)) { + pages++; + spin_lock(&init_mm.page_table_lock); + set_pte((pte_t *)pmd, + pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + spin_unlock(&init_mm.page_table_lock); + paddr_last = paddr_next; + continue; + } + + pte = alloc_low_page(); + paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot); + + spin_lock(&init_mm.page_table_lock); + pmd_populate_kernel(&init_mm, pmd, pte); + spin_unlock(&init_mm.page_table_lock); + } + update_page_count(PG_LEVEL_2M, pages); + return paddr_last; +} + +/* + * Create PUD level page table mapping for physical addresses. The virtual + * and physical address do not have to be aligned at this level. KASLR can + * randomize virtual addresses up to this level. + * It returns the last physical address mapped. + */ +static unsigned long __meminit +phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask) +{ + unsigned long pages = 0, paddr_next; + unsigned long paddr_last = paddr_end; + unsigned long vaddr = (unsigned long)__va(paddr); + int i = pud_index(vaddr); + + for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) { + pud_t *pud; + pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; + + vaddr = (unsigned long)__va(paddr); + pud = pud_page + pud_index(vaddr); + paddr_next = (paddr & PUD_MASK) + PUD_SIZE; + + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & PUD_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & PUD_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_pud(pud, __pud(0)); + continue; + } + + if (!pud_none(*pud)) { + if (!pud_large(*pud)) { + pmd = pmd_offset(pud, 0); + paddr_last = phys_pmd_init(pmd, paddr, + paddr_end, + page_size_mask, + prot); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_1G)) { + if (!after_bootmem) + pages++; + paddr_last = paddr_next; + continue; + } + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); + } + + if (page_size_mask & (1<<PG_LEVEL_1G)) { + pages++; + spin_lock(&init_mm.page_table_lock); + set_pte((pte_t *)pud, + pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); + paddr_last = paddr_next; + continue; + } + + pmd = alloc_low_page(); + paddr_last = phys_pmd_init(pmd, paddr, paddr_end, + page_size_mask, prot); + + spin_lock(&init_mm.page_table_lock); + pud_populate(&init_mm, pud, pmd); + spin_unlock(&init_mm.page_table_lock); + } + + update_page_count(PG_LEVEL_1G, pages); + + return paddr_last; +} + +static unsigned long __meminit +phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask) +{ + unsigned long paddr_next, paddr_last = paddr_end; + unsigned long vaddr = (unsigned long)__va(paddr); + int i = p4d_index(vaddr); + + if (!pgtable_l5_enabled()) + return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); + + for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { + p4d_t *p4d; + pud_t *pud; + + vaddr = (unsigned long)__va(paddr); + p4d = p4d_page + p4d_index(vaddr); + paddr_next = (paddr & P4D_MASK) + P4D_SIZE; + + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & P4D_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & P4D_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_p4d(p4d, __p4d(0)); + continue; + } + + if (!p4d_none(*p4d)) { + pud = pud_offset(p4d, 0); + paddr_last = phys_pud_init(pud, paddr, + paddr_end, + page_size_mask); + continue; + } + + pud = alloc_low_page(); + paddr_last = phys_pud_init(pud, paddr, paddr_end, + page_size_mask); + + spin_lock(&init_mm.page_table_lock); + p4d_populate(&init_mm, p4d, pud); + spin_unlock(&init_mm.page_table_lock); + } + + return paddr_last; +} + +/* + * Create page table mapping for the physical memory for specific physical + * addresses. The virtual and physical addresses have to be aligned on PMD level + * down. It returns the last physical address mapped. + */ +unsigned long __meminit +kernel_physical_mapping_init(unsigned long paddr_start, + unsigned long paddr_end, + unsigned long page_size_mask) +{ + bool pgd_changed = false; + unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last; + + paddr_last = paddr_end; + vaddr = (unsigned long)__va(paddr_start); + vaddr_end = (unsigned long)__va(paddr_end); + vaddr_start = vaddr; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d; + + vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; + + if (pgd_val(*pgd)) { + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + paddr_last = phys_p4d_init(p4d, __pa(vaddr), + __pa(vaddr_end), + page_size_mask); + continue; + } + + p4d = alloc_low_page(); + paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), + page_size_mask); + + spin_lock(&init_mm.page_table_lock); + if (pgtable_l5_enabled()) + pgd_populate(&init_mm, pgd, p4d); + else + p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); + spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; + } + + if (pgd_changed) + sync_global_pgds(vaddr_start, vaddr_end - 1); + + return paddr_last; +} + +#ifndef CONFIG_NUMA +void __init initmem_init(void) +{ + memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); +} +#endif + +void __init paging_init(void) +{ + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); + + /* + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. + */ + node_clear_state(0, N_MEMORY); + if (N_MEMORY != N_NORMAL_MEMORY) + node_clear_state(0, N_NORMAL_MEMORY); + + zone_sizes_init(); +} + +/* + * Memory hotplug specific functions + */ +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, + struct vmem_altmap *altmap, bool want_memblock) +{ + int ret; + + ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + WARN_ON_ONCE(ret); + + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); + + return ret; +} + +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + init_memory_mapping(start, start + size); + + return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); +} + +#define PAGE_INUSE 0xFD + +static void __meminit free_pagetable(struct page *page, int order) +{ + unsigned long magic; + unsigned int nr_pages = 1 << order; + + /* bootmem page has reserved flag */ + if (PageReserved(page)) { + __ClearPageReserved(page); + + magic = (unsigned long)page->freelist; + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + while (nr_pages--) + put_page_bootmem(page++); + } else + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit free_hugepage_table(struct page *page, + struct vmem_altmap *altmap) +{ + if (altmap) + vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE); + else + free_pagetable(page, get_order(PMD_SIZE)); +} + +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + /* free a pte talbe */ + free_pagetable(pmd_page(*pmd), 0); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + /* free a pmd talbe */ + free_pagetable(pud_page(*pud), 0); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + /* free a pud talbe */ + free_pagetable(p4d_page(*p4d), 0); + spin_lock(&init_mm.page_table_lock); + p4d_clear(p4d); + spin_unlock(&init_mm.page_table_lock); +} + +static void __meminit +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pte_t *pte; + void *page_addr; + phys_addr_t phys_addr; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + /* + * We mapped [0,1G) memory as identity mapping when + * initializing, in arch/x86/kernel/head_64.S. These + * pagetables cannot be removed. + */ + phys_addr = pte_val(*pte) + (addr & PAGE_MASK); + if (phys_addr < (phys_addr_t)0x40000000) + return; + + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { + /* + * Do not free direct mapping pages since they were + * freed when offlining, or simplely not in use. + */ + if (!direct) + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + + /* For non-direct mapping, pages means nothing. */ + pages++; + } else { + /* + * If we are here, we are freeing vmemmap pages since + * direct mapped memory ranges to be freed are aligned. + * + * If we are not removing the whole page, it means + * other page structs in this page are being used and + * we canot remove them. So fill the unused page_structs + * with 0xFD, and remove the page when it is wholly + * filled with 0xFD. + */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pte_page(*pte)); + if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { + free_pagetable(pte_page(*pte), 0); + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } + } + } + + /* Call free_pte_table() in remove_pmd_table(). */ + flush_tlb_all(); + if (direct) + update_page_count(PG_LEVEL_4K, -pages); +} + +static void __meminit +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, + bool direct, struct vmem_altmap *altmap) +{ + unsigned long next, pages = 0; + pte_t *pte_base; + pmd_t *pmd; + void *page_addr; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_hugepage_table(pmd_page(*pmd), + altmap); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pmd_page(*pmd)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PMD_SIZE)) { + free_hugepage_table(pmd_page(*pmd), + altmap); + + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); + } + + /* Call free_pmd_table() in remove_pud_table(). */ + if (direct) + update_page_count(PG_LEVEL_2M, -pages); +} + +static void __meminit +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, + struct vmem_altmap *altmap, bool direct) +{ + unsigned long next, pages = 0; + pmd_t *pmd_base; + pud_t *pud; + void *page_addr; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + if (!direct) + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + pages++; + } else { + /* If here, we are freeing vmemmap pages. */ + memset((void *)addr, PAGE_INUSE, next - addr); + + page_addr = page_address(pud_page(*pud)); + if (!memchr_inv(page_addr, PAGE_INUSE, + PUD_SIZE)) { + free_pagetable(pud_page(*pud), + get_order(PUD_SIZE)); + + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); + } + } + + continue; + } + + pmd_base = pmd_offset(pud, 0); + remove_pmd_table(pmd_base, addr, next, direct, altmap); + free_pmd_table(pmd_base, pud); + } + + if (direct) + update_page_count(PG_LEVEL_1G, -pages); +} + +static void __meminit +remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, + struct vmem_altmap *altmap, bool direct) +{ + unsigned long next, pages = 0; + pud_t *pud_base; + p4d_t *p4d; + + p4d = p4d_start + p4d_index(addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + BUILD_BUG_ON(p4d_large(*p4d)); + + pud_base = pud_offset(p4d, 0); + remove_pud_table(pud_base, addr, next, altmap, direct); + /* + * For 4-level page tables we do not want to free PUDs, but in the + * 5-level case we should free them. This code will have to change + * to adapt for boot-time switching between 4 and 5 level page tables. + */ + if (pgtable_l5_enabled()) + free_pud_table(pud_base, p4d); + } + + if (direct) + update_page_count(PG_LEVEL_512G, -pages); +} + +/* start and end are both virtual address. */ +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) +{ + unsigned long next; + unsigned long addr; + pgd_t *pgd; + p4d_t *p4d; + + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + p4d = p4d_offset(pgd, 0); + remove_p4d_table(p4d, addr, next, altmap, direct); + } + + flush_tlb_all(); +} + +void __ref vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + remove_pagetable(start, end, false, altmap); +} + +static void __meminit +kernel_physical_mapping_remove(unsigned long start, unsigned long end) +{ + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + + remove_pagetable(start, end, true, NULL); +} + +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + kernel_physical_mapping_remove(start, start + size); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static struct kcore_list kcore_vsyscall; + +static void __init register_page_bootmem_info(void) +{ +#ifdef CONFIG_NUMA + int i; + + for_each_online_node(i) + register_page_bootmem_info_node(NODE_DATA(i)); +#endif +} + +void __init mem_init(void) +{ + pci_iommu_alloc(); + + /* clear_bss() already clear the empty_zero_page */ + + /* this will put all memory onto the freelists */ + free_all_bootmem(); + after_bootmem = 1; + x86_init.hyper.init_after_bootmem(); + + /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and free_all_bootmem() initializes all the reserved + * deferred pages for us. + */ + register_page_bootmem_info(); + + /* Register memory areas for /proc/kcore */ + if (get_gate_vma(&init_mm)) + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER); + + mem_init_print_info(NULL); +} + +int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); + + /* + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO. Refer to the comment in + * static_protections() in pageattr.c + */ + set_memory_rw(start, (end - start) >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + /* + * Set the kernel identity mapping for text RO. + */ + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long text_end = PFN_ALIGN(&__stop___ex_table); + unsigned long rodata_end = PFN_ALIGN(&__end_rodata); + unsigned long all_end; + + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + kernel_set_to_readonly = 1; + + /* + * The rodata/data/bss/brk section (but not the kernel text!) + * should also be not-executable. + * + * We align all_end to PMD_SIZE because the existing mapping + * is a full PMD. If we would align _brk_end to PAGE_SIZE we + * split the PMD and the reminder between _brk_end and the end + * of the PMD will remain mapped executable. + * + * Any PMD which was setup after the one which covers _brk_end + * has been zapped already via cleanup_highmem(). + */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); + set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); + set_memory_rw(start, (end-start) >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: again\n"); + set_memory_ro(start, (end-start) >> PAGE_SHIFT); +#endif + + free_kernel_image_pages((void *)text_end, (void *)rodata_start); + free_kernel_image_pages((void *)rodata_end, (void *)_sdata); + + debug_checkwx(); +} + +int kern_addr_valid(unsigned long addr) +{ + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (above != 0 && above != -1UL) + return 0; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + return 0; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return 0; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return 0; + + if (pud_large(*pud)) + return pfn_valid(pud_pfn(*pud)); + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return pfn_valid(pmd_pfn(*pmd)); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + return 0; + + return pfn_valid(pte_pfn(*pte)); +} + +/* + * Block size is the minimum amount of memory which can be hotplugged or + * hotremoved. It must be power of two and must be equal or larger than + * MIN_MEMORY_BLOCK_SIZE. + */ +#define MAX_BLOCK_SIZE (2UL << 30) + +/* Amount of ram needed to start using large blocks */ +#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30) + +/* Adjustable memory block size */ +static unsigned long set_memory_block_size; +int __init set_memory_block_size_order(unsigned int order) +{ + unsigned long size = 1UL << order; + + if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE) + return -EINVAL; + + set_memory_block_size = size; + return 0; +} + +static unsigned long probe_memory_block_size(void) +{ + unsigned long boot_mem_end = max_pfn << PAGE_SHIFT; + unsigned long bz; + + /* If memory block size has been set, then use it */ + bz = set_memory_block_size; + if (bz) + goto done; + + /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */ + if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) { + bz = MIN_MEMORY_BLOCK_SIZE; + goto done; + } + + /* Find the largest allowed block size that aligns to memory end */ + for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { + if (IS_ALIGNED(boot_mem_end, bz)) + break; + } +done: + pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); + + return bz; +} + +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void) +{ + if (!memory_block_size_probed) + memory_block_size_probed = probe_memory_block_size(); + + return memory_block_size_probed; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * Initialise the sparsemem vmemmap using huge-pages at the PMD level. + */ +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + +static int __meminit vmemmap_populate_hugepages(unsigned long start, + unsigned long end, int node, struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p; + + if (altmap) + p = altmap_alloc_block_buf(PMD_SIZE, altmap); + else + p = vmemmap_alloc_block_buf(PMD_SIZE, node); + if (p) { + pte_t entry; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; + continue; + } else if (altmap) + return -ENOMEM; /* no fallback */ + } else if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; + } + if (vmemmap_populate_basepages(addr, next, node)) + return -ENOMEM; + } + return 0; +} + +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + int err; + + if (boot_cpu_has(X86_FEATURE_PSE)) + err = vmemmap_populate_hugepages(start, end, node, altmap); + else if (altmap) { + pr_err_once("%s: no cpu support for altmap allocations\n", + __func__); + err = -ENOMEM; + } else + err = vmemmap_populate_basepages(start, end, node); + if (!err) + sync_global_pgds(start, end - 1); + return err; +} + +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long nr_pages) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + nr_pages); + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + unsigned int nr_pmd_pages; + struct page *page; + + for (; addr < end; addr = next) { + pte_t *pte = NULL; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO); + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); + + if (!boot_cpu_has(X86_FEATURE_PSE)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + get_page_bootmem(section_nr, pmd_page(*pmd), + MIX_SECTION_INFO); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + continue; + get_page_bootmem(section_nr, pte_page(*pte), + SECTION_INFO); + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + continue; + + nr_pmd_pages = 1 << get_order(PMD_SIZE); + page = pmd_page(*pmd); + while (nr_pmd_pages--) + get_page_bootmem(section_nr, page++, + SECTION_INFO); + } + } +} +#endif + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } +} +#endif diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c new file mode 100644 index 000000000..b3294d367 --- /dev/null +++ b/arch/x86/mm/iomap_32.c @@ -0,0 +1,129 @@ +/* + * Copyright © 2008 Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + */ + +#include <asm/iomap.h> +#include <asm/pat.h> +#include <linux/export.h> +#include <linux/highmem.h> + +static int is_io_mapping_possible(resource_size_t base, unsigned long size) +{ +#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) + /* There is no way to map greater than 1 << 32 address without PAE */ + if (base + size > 0x100000000ULL) + return 0; +#endif + return 1; +} + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &pcm); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm)); + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(*prot) &= __default_kernel_pte_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void iomap_free(resource_size_t base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); + +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +{ + unsigned long vaddr; + int idx, type; + + preempt_disable(); + pagefault_disable(); + + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR * smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); + arch_flush_lazy_mmu_mode(); + + return (void *)vaddr; +} + +/* + * Map 'pfn' using protections 'prot' + */ +void __iomem * +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +{ + /* + * For non-PAT systems, translate non-WB request to UC- just in + * case the caller set the PWT bit to prot directly without using + * pgprot_writecombine(). UC- translates to uncached if the MTRR + * is UC or WC. UC- gets the real intention, of the user, which is + * "WC if the MTRR is WC, UC if you can't do that." + */ + if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB) + prot = __pgprot(__PAGE_KERNEL | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); + + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(prot) &= __default_kernel_pte_mask; + + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); +} +EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); + +void +iounmap_atomic(void __iomem *kvaddr) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. + */ + kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); + } + + pagefault_enable(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c new file mode 100644 index 000000000..adc77904f --- /dev/null +++ b/arch/x86/mm/ioremap.c @@ -0,0 +1,827 @@ +/* + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/ioport.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/mmiotrace.h> +#include <linux/mem_encrypt.h> +#include <linux/efi.h> + +#include <asm/set_memory.h> +#include <asm/e820/api.h> +#include <asm/fixmap.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> +#include <asm/pat.h> +#include <asm/setup.h> + +#include "physaddr.h" + +struct ioremap_mem_flags { + bool system_ram; + bool desc_other; +}; + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +int ioremap_change_attr(unsigned long vaddr, unsigned long size, + enum page_cache_mode pcm) +{ + unsigned long nrpages = size >> PAGE_SHIFT; + int err; + + switch (pcm) { + case _PAGE_CACHE_MODE_UC: + default: + err = _set_memory_uc(vaddr, nrpages); + break; + case _PAGE_CACHE_MODE_WC: + err = _set_memory_wc(vaddr, nrpages); + break; + case _PAGE_CACHE_MODE_WT: + err = _set_memory_wt(vaddr, nrpages); + break; + case _PAGE_CACHE_MODE_WB: + err = _set_memory_wb(vaddr, nrpages); + break; + } + + return err; +} + +static bool __ioremap_check_ram(struct resource *res) +{ + unsigned long start_pfn, stop_pfn; + unsigned long i; + + if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) + return false; + + start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; + stop_pfn = (res->end + 1) >> PAGE_SHIFT; + if (stop_pfn > start_pfn) { + for (i = 0; i < (stop_pfn - start_pfn); ++i) + if (pfn_valid(start_pfn + i) && + !PageReserved(pfn_to_page(start_pfn + i))) + return true; + } + + return false; +} + +static int __ioremap_check_desc_other(struct resource *res) +{ + return (res->desc != IORES_DESC_NONE); +} + +static int __ioremap_res_check(struct resource *res, void *arg) +{ + struct ioremap_mem_flags *flags = arg; + + if (!flags->system_ram) + flags->system_ram = __ioremap_check_ram(res); + + if (!flags->desc_other) + flags->desc_other = __ioremap_check_desc_other(res); + + return flags->system_ram && flags->desc_other; +} + +/* + * To avoid multiple resource walks, this function walks resources marked as + * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a + * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). + */ +static void __ioremap_check_mem(resource_size_t addr, unsigned long size, + struct ioremap_mem_flags *flags) +{ + u64 start, end; + + start = (u64)addr; + end = start + size - 1; + memset(flags, 0, sizeof(*flags)); + + walk_mem_res(start, end, flags, __ioremap_res_check); +} + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. It transparently creates kernel huge I/O mapping when + * the physical address is aligned by a huge page size (1GB or 2MB) and + * the requested size is at least the huge page size. + * + * NOTE: MTRRs can override PAT memory types with a 4KB granularity. + * Therefore, the mapping code falls back to use a smaller page toward 4KB + * when a mapping range is covered by non-WB type of MTRRs. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +static void __iomem *__ioremap_caller(resource_size_t phys_addr, + unsigned long size, enum page_cache_mode pcm, void *caller) +{ + unsigned long offset, vaddr; + resource_size_t last_addr; + const resource_size_t unaligned_phys_addr = phys_addr; + const unsigned long unaligned_size = size; + struct ioremap_mem_flags mem_flags; + struct vm_struct *area; + enum page_cache_mode new_pcm; + pgprot_t prot; + int retval; + void __iomem *ret_addr; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + if (!phys_addr_valid(phys_addr)) { + printk(KERN_WARNING "ioremap: invalid physical address %llx\n", + (unsigned long long)phys_addr); + WARN_ON_ONCE(1); + return NULL; + } + + __ioremap_check_mem(phys_addr, size, &mem_flags); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + if (mem_flags.system_ram) { + WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", + &phys_addr, &last_addr); + return NULL; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PHYSICAL_PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + retval = reserve_memtype(phys_addr, (u64)phys_addr + size, + pcm, &new_pcm); + if (retval) { + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); + return NULL; + } + + if (pcm != new_pcm) { + if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) { + printk(KERN_ERR + "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n", + (unsigned long long)phys_addr, + (unsigned long long)(phys_addr + size), + pcm, new_pcm); + goto err_free_memtype; + } + pcm = new_pcm; + } + + /* + * If the page being mapped is in memory and SEV is active then + * make sure the memory encryption attribute is enabled in the + * resulting mapping. + */ + prot = PAGE_KERNEL_IO; + if (sev_active() && mem_flags.desc_other) + prot = pgprot_encrypted(prot); + + switch (pcm) { + case _PAGE_CACHE_MODE_UC: + default: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC)); + break; + case _PAGE_CACHE_MODE_UC_MINUS: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)); + break; + case _PAGE_CACHE_MODE_WC: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); + break; + case _PAGE_CACHE_MODE_WT: + prot = __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); + break; + case _PAGE_CACHE_MODE_WB: + break; + } + + /* + * Ok, go for it.. + */ + area = get_vm_area_caller(size, VM_IOREMAP, caller); + if (!area) + goto err_free_memtype; + area->phys_addr = phys_addr; + vaddr = (unsigned long) area->addr; + + if (kernel_map_sync_memtype(phys_addr, size, pcm)) + goto err_free_area; + + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) + goto err_free_area; + + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size)) + pr_warn("caller %pS mapping multiple BARs\n", caller); + + return ret_addr; +err_free_area: + free_vm_area(area); +err_free_memtype: + free_memtype(phys_addr, phys_addr + size); + return NULL; +} + +/** + * ioremap_nocache - map bus memory into CPU space + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) +{ + /* + * Ideally, this should be: + * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS; + * + * Till we fix all X drivers to use ioremap_wc(), we will use + * UC MINUS. Drivers that are certain they need or can already + * be converted over to strong UC can use ioremap_uc(). + */ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_nocache); + +/** + * ioremap_uc - map bus memory into CPU space as strongly uncachable + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * ioremap_uc performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked with a strong + * preference as completely uncachable on the CPU when possible. For non-PAT + * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT + * systems this will set the PAT entry for the pages as strong UC. This call + * will honor existing caching rules from things like the PCI bus. Note that + * there are other caches and buffers on many busses. In particular driver + * authors should read up on PCI writes. + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC; + + return __ioremap_caller(phys_addr, size, pcm, + __builtin_return_address(0)); +} +EXPORT_SYMBOL_GPL(ioremap_uc); + +/** + * ioremap_wc - map memory into CPU space write combined + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write combining. + * Write combining allows faster writes to some hardware devices. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wc); + +/** + * ioremap_wt - map memory into CPU space write through + * @phys_addr: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write through. + * Write through stores data into memory while keeping the cache up-to-date. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_cache); + +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return __ioremap_caller(phys_addr, size, + pgprot2cachemode(__pgprot(prot_val)), + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_prot); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. + */ + if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); + return; + } + + mmiotrace_iounmap(addr); + + addr = (volatile void __iomem *) + (PAGE_MASK & (unsigned long __force)addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + p = find_vm_area((void __force *)addr); + + if (!p) { + printk(KERN_ERR "iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); + + /* Finally remove it */ + o = remove_vm_area((void __force *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +int __init arch_ioremap_pud_supported(void) +{ +#ifdef CONFIG_X86_64 + return boot_cpu_has(X86_FEATURE_GBPAGES); +#else + return 0; +#endif +} + +int __init arch_ioremap_pmd_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PSE); +} + +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(phys_addr_t phys) +{ + unsigned long start = phys & PAGE_MASK; + unsigned long offset = phys & ~PAGE_MASK; + void *vaddr; + + /* memremap() maps if RAM, otherwise falls back to ioremap() */ + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); + + /* Only add the offset on success and return NULL if memremap() failed */ + if (vaddr) + vaddr += offset; + + return vaddr; +} + +void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) +{ + memunmap((void *)((unsigned long)addr & PAGE_MASK)); +} + +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. And since the encryption key can + * change across reboots, persistent memory should also be mapped + * decrypted. + * + * If SEV is active, that implies that BIOS/UEFI also ran encrypted so + * only persistent memory should be mapped decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + int is_pmem; + + /* + * Check if the address is part of a persistent memory region. + * This check covers areas added by E820, EFI and ACPI. + */ + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); + if (is_pmem != REGION_DISJOINT) + return true; + + /* + * Check if the non-volatile attribute is set for an EFI + * reserved area. + */ + if (efi_enabled(EFI_BOOT)) { + switch (efi_mem_type(phys_addr)) { + case EFI_RESERVED_TYPE: + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) + return true; + break; + default: + break; + } + } + + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + /* For SEV, these areas are encrypted */ + if (sev_active()) + break; + /* Fallthrough */ + + case E820_TYPE_PRAM: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. + */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + + paddr_next = data->next; + len = data->len; + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + + paddr_next = data->next; + len = data->len; + + early_memunmap(data, sizeof(*data)); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. + */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!mem_encrypt_active()) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (sme_active()) { + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + return false; + } + + return !memremap_should_map_decrypted(phys_addr, size); +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + bool encrypted_prot; + + if (!mem_encrypt_active()) + return prot; + + encrypted_prot = true; + + if (sme_active()) { + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size)) + encrypted_prot = false; + } + + if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size)) + encrypted_prot = false; + + return encrypted_prot ? pgprot_encrypted(prot) + : pgprot_decrypted(prot); +} + +bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) +{ + return arch_memremap_can_ram_remap(phys_addr, size, 0); +} + +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); +} +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; + +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) +{ + /* Don't assume we're using swapper_pg_dir at this point */ + pgd_t *base = __va(read_cr3_pa()); + pgd_t *pgd = &base[pgd_index(addr)]; + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + pmd_t *pmd = pmd_offset(pud, addr); + + return pmd; +} + +static inline pte_t * __init early_ioremap_pte(unsigned long addr) +{ + return &bm_pte[pte_index(addr)]; +} + +bool __init is_early_ioremap_ptep(pte_t *ptep) +{ + return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; +} + +void __init early_ioremap_init(void) +{ + pmd_t *pmd; + +#ifdef CONFIG_X86_64 + BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#else + WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#endif + + early_ioremap_setup(); + + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + memset(bm_pte, 0, sizeof(bm_pte)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); + + /* + * The boot-ioremap range spans multiple pmds, for which + * we are not prepared: + */ +#define __FIXADDR_TOP (-PAGE_SIZE) + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); +#undef __FIXADDR_TOP + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + printk(KERN_WARNING "pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } +} + +void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + pte = early_ioremap_pte(addr); + + /* Sanitize 'prot' against any unsupported bits: */ + pgprot_val(flags) &= __default_kernel_pte_mask; + + if (pgprot_val(flags)) + set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); + else + pte_clear(&init_mm, addr, pte); + __flush_tlb_one_kernel(addr); +} diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c new file mode 100644 index 000000000..4bfd14d5d --- /dev/null +++ b/arch/x86/mm/kasan_init_64.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0 +#define DISABLE_BRANCH_PROFILING +#define pr_fmt(fmt) "kasan: " fmt + +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + +#include <linux/bootmem.h> +#include <linux/kasan.h> +#include <linux/kdebug.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/vmalloc.h> + +#include <asm/e820/types.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/pgtable.h> +#include <asm/cpu_entry_area.h> + +extern struct range pfn_mapped[E820_MAX_ENTRIES]; + +static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); + +static __init void *early_alloc(size_t size, int nid, bool panic) +{ + if (panic) + return memblock_virt_alloc_try_nid(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); + else + return memblock_virt_alloc_try_nid_nopanic(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); +} + +static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, int nid) +{ + pte_t *pte; + + if (pmd_none(*pmd)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_PSE) && + ((end - addr) == PMD_SIZE) && + IS_ALIGNED(addr, PMD_SIZE)) { + p = early_alloc(PMD_SIZE, nid, false); + if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PMD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid, true); + pmd_populate_kernel(&init_mm, pmd, p); + } + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t entry; + void *p; + + if (!pte_none(*pte)) + continue; + + p = early_alloc(PAGE_SIZE, nid, true); + entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, + unsigned long end, int nid) +{ + pmd_t *pmd; + unsigned long next; + + if (pud_none(*pud)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_GBPAGES) && + ((end - addr) == PUD_SIZE) && + IS_ALIGNED(addr, PUD_SIZE)) { + p = early_alloc(PUD_SIZE, nid, false); + if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PUD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid, true); + pud_populate(&init_mm, pud, p); + } + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_large(*pmd)) + kasan_populate_pmd(pmd, addr, next, nid); + } while (pmd++, addr = next, addr != end); +} + +static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, int nid) +{ + pud_t *pud; + unsigned long next; + + if (p4d_none(*p4d)) { + void *p = early_alloc(PAGE_SIZE, nid, true); + + p4d_populate(&init_mm, p4d, p); + } + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_large(*pud)) + kasan_populate_pud(pud, addr, next, nid); + } while (pud++, addr = next, addr != end); +} + +static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, + unsigned long end, int nid) +{ + void *p; + p4d_t *p4d; + unsigned long next; + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, nid, true); + pgd_populate(&init_mm, pgd, p); + } + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + kasan_populate_p4d(p4d, addr, next, nid); + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, + int nid) +{ + pgd_t *pgd; + unsigned long next; + + addr = addr & PAGE_MASK; + end = round_up(end, PAGE_SIZE); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + kasan_populate_pgd(pgd, addr, next, nid); + } while (pgd++, addr = next, addr != end); +} + +static void __init map_range(struct range *range) +{ + unsigned long start; + unsigned long end; + + start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); + end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); + + kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); +} + +static void __init clear_pgds(unsigned long start, + unsigned long end) +{ + pgd_t *pgd; + /* See comment in kasan_init() */ + unsigned long pgd_end = end & PGDIR_MASK; + + for (; start < pgd_end; start += PGDIR_SIZE) { + pgd = pgd_offset_k(start); + /* + * With folded p4d, pgd_clear() is nop, use p4d_clear() + * instead. + */ + if (pgtable_l5_enabled()) + pgd_clear(pgd); + else + p4d_clear(p4d_offset(pgd, start)); + } + + pgd = pgd_offset_k(start); + for (; start < end; start += P4D_SIZE) + p4d_clear(p4d_offset(pgd, start)); +} + +static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) +{ + unsigned long p4d; + + if (!pgtable_l5_enabled()) + return (p4d_t *)pgd; + + p4d = pgd_val(*pgd) & PTE_PFN_MASK; + p4d += __START_KERNEL_map - phys_base; + return (p4d_t *)p4d + p4d_index(addr); +} + +static void __init kasan_early_p4d_populate(pgd_t *pgd, + unsigned long addr, + unsigned long end) +{ + pgd_t pgd_entry; + p4d_t *p4d, p4d_entry; + unsigned long next; + + if (pgd_none(*pgd)) { + pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d)); + set_pgd(pgd, pgd_entry); + } + + p4d = early_p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (!p4d_none(*p4d)) + continue; + + p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud)); + set_p4d(p4d, p4d_entry); + } while (p4d++, addr = next, addr != end && p4d_none(*p4d)); +} + +static void __init kasan_map_early_shadow(pgd_t *pgd) +{ + /* See comment in kasan_init() */ + unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK; + unsigned long end = KASAN_SHADOW_END; + unsigned long next; + + pgd += pgd_index(addr); + do { + next = pgd_addr_end(addr, end); + kasan_early_p4d_populate(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} + +#ifdef CONFIG_KASAN_INLINE +static int kasan_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + if (val == DIE_GPF) { + pr_emerg("CONFIG_KASAN_INLINE enabled\n"); + pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n"); + } + return NOTIFY_OK; +} + +static struct notifier_block kasan_die_notifier = { + .notifier_call = kasan_die_handler, +}; +#endif + +void __init kasan_early_init(void) +{ + int i; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; + pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; + pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; + p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; + + /* Mask out unsupported __PAGE_KERNEL bits: */ + pte_val &= __default_kernel_pte_mask; + pmd_val &= __default_kernel_pte_mask; + pud_val &= __default_kernel_pte_mask; + p4d_val &= __default_kernel_pte_mask; + + for (i = 0; i < PTRS_PER_PTE; i++) + kasan_zero_pte[i] = __pte(pte_val); + + for (i = 0; i < PTRS_PER_PMD; i++) + kasan_zero_pmd[i] = __pmd(pmd_val); + + for (i = 0; i < PTRS_PER_PUD; i++) + kasan_zero_pud[i] = __pud(pud_val); + + for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++) + kasan_zero_p4d[i] = __p4d(p4d_val); + + kasan_map_early_shadow(early_top_pgt); + kasan_map_early_shadow(init_top_pgt); +} + +void __init kasan_init(void) +{ + int i; + void *shadow_cpu_entry_begin, *shadow_cpu_entry_end; + +#ifdef CONFIG_KASAN_INLINE + register_die_notifier(&kasan_die_notifier); +#endif + + memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); + + /* + * We use the same shadow offset for 4- and 5-level paging to + * facilitate boot-time switching between paging modes. + * As result in 5-level paging mode KASAN_SHADOW_START and + * KASAN_SHADOW_END are not aligned to PGD boundary. + * + * KASAN_SHADOW_START doesn't share PGD with anything else. + * We claim whole PGD entry to make things easier. + * + * KASAN_SHADOW_END lands in the last PGD entry and it collides with + * bunch of things like kernel code, modules, EFI mapping, etc. + * We need to take extra steps to not overwrite them. + */ + if (pgtable_l5_enabled()) { + void *ptr; + + ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); + memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table)); + set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)], + __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE)); + } + + load_cr3(early_top_pgt); + __flush_tlb_all(); + + clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END); + + kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK), + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + for (i = 0; i < E820_MAX_ENTRIES; i++) { + if (pfn_mapped[i].end == 0) + break; + + map_range(&pfn_mapped[i]); + } + + shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE; + shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin); + shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin, + PAGE_SIZE); + + shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE + + CPU_ENTRY_AREA_MAP_SIZE); + shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end); + shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end, + PAGE_SIZE); + + kasan_populate_zero_shadow( + kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + shadow_cpu_entry_begin); + + kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, + (unsigned long)shadow_cpu_entry_end, 0); + + kasan_populate_zero_shadow(shadow_cpu_entry_end, + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + + kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); + + kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), + (void *)KASAN_SHADOW_END); + + load_cr3(init_top_pgt); + __flush_tlb_all(); + + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. + */ + memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte; + pgprot_t prot; + + prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC); + pgprot_val(prot) &= __default_kernel_pte_mask; + + pte = __pte(__pa(kasan_zero_page) | pgprot_val(prot)); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); + + init_task.kasan_depth = 0; + pr_info("KernelAddressSanitizer initialized\n"); +} diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c new file mode 100644 index 000000000..bfe769209 --- /dev/null +++ b/arch/x86/mm/kaslr.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file implements KASLR memory randomization for x86_64. It randomizes + * the virtual address space of kernel memory regions (physical memory + * mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates + * exploits relying on predictable kernel addresses. + * + * Entropy is generated using the KASLR early boot functions now shared in + * the lib directory (originally written by Kees Cook). Randomization is + * done on PGD & P4D/PUD page table levels to increase possible addresses. + * The physical memory mapping code was adapted to support P4D/PUD level + * virtual addresses. This implementation on the best configuration provides + * 30,000 possible virtual addresses in average for each memory region. + * An additional low memory page is used to ensure each CPU can start with + * a PGD aligned virtual address (for realmode). + * + * The order of each memory region is not changed. The feature looks at + * the available space for the regions based on different configuration + * options and randomizes the base and space between each. The size of the + * physical memory mapping is the available physical memory. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/random.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/setup.h> +#include <asm/kaslr.h> + +#include "mm_internal.h" + +#define TB_SHIFT 40 + +/* + * The end address could depend on more configuration options to make the + * highest amount of space for randomization available, but that's too hard + * to keep straight and caused issues already. + */ +static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; + +/* + * Memory regions randomized by KASLR (except modules that use a separate logic + * earlier during boot). The list is ordered based on virtual addresses. This + * order is kept after randomization. + */ +static __initdata struct kaslr_memory_region { + unsigned long *base; + unsigned long size_tb; +} kaslr_regions[] = { + { &page_offset_base, 0 }, + { &vmalloc_base, 0 }, + { &vmemmap_base, 0 }, +}; + +/* Get size in bytes used by the memory region */ +static inline unsigned long get_padding(struct kaslr_memory_region *region) +{ + return (region->size_tb << TB_SHIFT); +} + +/* + * Apply no randomization if KASLR was disabled at boot or if KASAN + * is enabled. KASAN shadow mappings rely on regions being PGD aligned. + */ +static inline bool kaslr_memory_enabled(void) +{ + return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); +} + +/* Initialize base and padding for each memory region randomized with KASLR */ +void __init kernel_randomize_memory(void) +{ + size_t i; + unsigned long vaddr_start, vaddr; + unsigned long rand, memory_tb; + struct rnd_state rand_state; + unsigned long remain_entropy; + unsigned long vmemmap_size; + + vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; + vaddr = vaddr_start; + + /* + * These BUILD_BUG_ON checks ensure the memory layout is consistent + * with the vaddr_start/vaddr_end variables. These checks are very + * limited.... + */ + BUILD_BUG_ON(vaddr_start >= vaddr_end); + BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE); + BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + + if (!kaslr_memory_enabled()) + return; + + kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT); + kaslr_regions[1].size_tb = VMALLOC_SIZE_TB; + + /* + * Update Physical memory mapping to available and + * add padding if needed (especially for memory hotplug support). + */ + BUG_ON(kaslr_regions[0].base != &page_offset_base); + memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; + + /* Adapt phyiscal memory region size based on available memory */ + if (memory_tb < kaslr_regions[0].size_tb) + kaslr_regions[0].size_tb = memory_tb; + + /* + * Calculate the vmemmap region size in TBs, aligned to a TB + * boundary. + */ + vmemmap_size = (kaslr_regions[0].size_tb << (TB_SHIFT - PAGE_SHIFT)) * + sizeof(struct page); + kaslr_regions[2].size_tb = DIV_ROUND_UP(vmemmap_size, 1UL << TB_SHIFT); + + /* Calculate entropy available between regions */ + remain_entropy = vaddr_end - vaddr_start; + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) + remain_entropy -= get_padding(&kaslr_regions[i]); + + prandom_seed_state(&rand_state, kaslr_get_random_long("Memory")); + + for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) { + unsigned long entropy; + + /* + * Select a random virtual address using the extra entropy + * available. + */ + entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); + prandom_bytes_state(&rand_state, &rand, sizeof(rand)); + if (pgtable_l5_enabled()) + entropy = (rand % (entropy + 1)) & P4D_MASK; + else + entropy = (rand % (entropy + 1)) & PUD_MASK; + vaddr += entropy; + *kaslr_regions[i].base = vaddr; + + /* + * Jump the region and add a minimum padding based on + * randomization alignment. + */ + vaddr += get_padding(&kaslr_regions[i]); + if (pgtable_l5_enabled()) + vaddr = round_up(vaddr + 1, P4D_SIZE); + else + vaddr = round_up(vaddr + 1, PUD_SIZE); + remain_entropy -= entropy; + } +} + +static void __meminit init_trampoline_pud(void) +{ + unsigned long paddr, paddr_next; + pgd_t *pgd; + pud_t *pud_page, *pud_page_tramp; + int i; + + pud_page_tramp = alloc_low_page(); + + paddr = 0; + pgd = pgd_offset_k((unsigned long)__va(paddr)); + pud_page = (pud_t *) pgd_page_vaddr(*pgd); + + for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { + pud_t *pud, *pud_tramp; + unsigned long vaddr = (unsigned long)__va(paddr); + + pud_tramp = pud_page_tramp + pud_index(paddr); + pud = pud_page + pud_index(vaddr); + paddr_next = (paddr & PUD_MASK) + PUD_SIZE; + + *pud_tramp = *pud; + } + + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); +} + +static void __meminit init_trampoline_p4d(void) +{ + unsigned long paddr, paddr_next; + pgd_t *pgd; + p4d_t *p4d_page, *p4d_page_tramp; + int i; + + p4d_page_tramp = alloc_low_page(); + + paddr = 0; + pgd = pgd_offset_k((unsigned long)__va(paddr)); + p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); + + for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { + p4d_t *p4d, *p4d_tramp; + unsigned long vaddr = (unsigned long)__va(paddr); + + p4d_tramp = p4d_page_tramp + p4d_index(paddr); + p4d = p4d_page + p4d_index(vaddr); + paddr_next = (paddr & P4D_MASK) + P4D_SIZE; + + *p4d_tramp = *p4d; + } + + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); +} + +/* + * Create PGD aligned trampoline table to allow real mode initialization + * of additional CPUs. Consume only 1 low memory page. + */ +void __meminit init_trampoline(void) +{ + + if (!kaslr_memory_enabled()) { + init_trampoline_default(); + return; + } + + if (pgtable_l5_enabled()) + init_trampoline_p4d(); + else + init_trampoline_pud(); +} diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c new file mode 100644 index 000000000..79eb55ce6 --- /dev/null +++ b/arch/x86/mm/kmmio.c @@ -0,0 +1,627 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Support for MMIO probes. + * Benfit many code from kprobes + * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. + * 2007 Alexander Eichner + * 2008 Pekka Paalanen <pq@iki.fi> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> +#include <linux/hash.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/mutex.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <linux/errno.h> +#include <asm/debugreg.h> +#include <linux/mmiotrace.h> + +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long addr; /* the requested address */ + pteval_t old_presence; /* page presence prior to arming */ + bool armed; + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU) and post_kmmio_handler(). + * Protected by kmmio_lock, when linked into kmmio_page_table. + */ + int count; + + bool scheduled_for_release; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + unsigned long addr; + int active; +}; + +static DEFINE_SPINLOCK(kmmio_lock); + +/* Protected by kmmio_lock */ +unsigned int kmmio_count; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct list_head *kmmio_page_list(unsigned long addr) +{ + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; +} + +/* Accessed per-cpu */ +static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); + +/* + * this is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be simple) + * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup + */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ +static struct kmmio_probe *get_kmmio_probe(unsigned long addr) +{ + struct kmmio_probe *p; + list_for_each_entry_rcu(p, &kmmio_probes, list) { + if (addr >= p->addr && addr < (p->addr + p->len)) + return p; + } + return NULL; +} + +/* You must be holding RCU read lock. */ +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) +{ + struct list_head *head; + struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); + list_for_each_entry_rcu(f, head, list) { + if (f->addr == addr) + return f; + } + return NULL; +} + +static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) +{ + pmd_t new_pmd; + pmdval_t v = pmd_val(*pmd); + if (clear) { + *old = v; + new_pmd = pmd_mknotpresent(*pmd); + } else { + /* Presume this has been called with clear==true previously */ + new_pmd = __pmd(*old); + } + set_pmd(pmd, new_pmd); +} + +static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) +{ + pteval_t v = pte_val(*pte); + if (clear) { + *old = v; + /* Nothing should care about address */ + pte_clear(&init_mm, 0, pte); + } else { + /* Presume this has been called with clear==true previously */ + set_pte_atomic(pte, __pte(*old)); + } +} + +static int clear_page_presence(struct kmmio_fault_page *f, bool clear) +{ + unsigned int level; + pte_t *pte = lookup_address(f->addr, &level); + + if (!pte) { + pr_err("no pte for addr 0x%08lx\n", f->addr); + return -1; + } + + switch (level) { + case PG_LEVEL_2M: + clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence); + break; + case PG_LEVEL_4K: + clear_pte_presence(pte, clear, &f->old_presence); + break; + default: + pr_err("unexpected page level 0x%x.\n", level); + return -1; + } + + __flush_tlb_one_kernel(f->addr); + return 0; +} + +/* + * Mark the given page as not present. Access to it will trigger a fault. + * + * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the + * protection is ignored here. RCU read lock is assumed held, so the struct + * will not disappear unexpectedly. Furthermore, the caller must guarantee, + * that double arming the same virtual address (page) cannot occur. + * + * Double disarming on the other hand is allowed, and may occur when a fault + * and mmiotrace shutdown happen simultaneously. + */ +static int arm_kmmio_fault_page(struct kmmio_fault_page *f) +{ + int ret; + WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); + if (f->armed) { + pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); + } + ret = clear_page_presence(f, true); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); + f->armed = true; + return ret; +} + +/** Restore the given page to saved presence state. */ +static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) +{ + int ret = clear_page_presence(f, false); + WARN_ONCE(ret < 0, + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); + f->armed = false; +} + +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. + * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate + * and they remain disabled throughout this function. + */ +int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); + + /* + * Preemption is now disabled to prevent process switch during + * single stepping. We can only handle one active kmmio trace + * per cpu, so ensure that we finish it before something else + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. + */ + preempt_disable(); + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(page_base); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. The latter case should not be possible. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); + if (ctx->active) { + if (page_base == ctx->addr) { + /* + * A second fault on the same page means some other + * condition needs handling by do_page_fault(), the + * page really not being present is the most common. + */ + pr_debug("secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); + + if (!faultpage->old_presence) + pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n", + addr, smp_processor_id()); + } else { + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at + * least prevents a panic. + */ + pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr); + disarm_kmmio_fault_page(faultpage); + } + goto no_kmmio_ctx; + } + ctx->active++; + + ctx->fpage = faultpage; + ctx->probe = get_kmmio_probe(page_base); + ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + ctx->addr = page_base; + + if (ctx->probe && ctx->probe->pre_handler) + ctx->probe->pre_handler(ctx->probe, regs, addr); + + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + + /* Now we set present bit in PTE and single step. */ + disarm_kmmio_fault_page(ctx->fpage); + + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. + */ + + put_cpu_var(kmmio_ctx); + return 1; /* fault handled */ + +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); +no_kmmio: + rcu_read_unlock(); + preempt_enable_no_resched(); + return ret; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate + * and they remain disabled throughout this function. + * This must always get called as the pair to kmmio_handler(). + */ +static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) +{ + int ret = 0; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + + if (!ctx->active) { + /* + * debug traps without an active context are due to either + * something external causing them (f.e. using a debugger while + * mmio tracing enabled), or erroneous behaviour + */ + pr_warning("unexpected debug trap on CPU %d.\n", + smp_processor_id()); + goto out; + } + + if (ctx->probe && ctx->probe->post_handler) + ctx->probe->post_handler(ctx->probe, condition, regs); + + /* Prevent racing against release_kmmio_fault_page(). */ + spin_lock(&kmmio_lock); + if (ctx->fpage->count) + arm_kmmio_fault_page(ctx->fpage); + spin_unlock(&kmmio_lock); + + regs->flags &= ~X86_EFLAGS_TF; + regs->flags |= ctx->saved_flags; + + /* These were acquired in kmmio_handler(). */ + ctx->active--; + BUG_ON(ctx->active); + rcu_read_unlock(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (!(regs->flags & X86_EFLAGS_TF)) + ret = 1; +out: + put_cpu_var(kmmio_ctx); + return ret; +} + +/* You must be holding kmmio_lock. */ +static int add_kmmio_fault_page(unsigned long addr) +{ + struct kmmio_fault_page *f; + + f = get_kmmio_fault_page(addr); + if (f) { + if (!f->count) + arm_kmmio_fault_page(f); + f->count++; + return 0; + } + + f = kzalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -1; + + f->count = 1; + f->addr = addr; + + if (arm_kmmio_fault_page(f)) { + kfree(f); + return -1; + } + + list_add_rcu(&f->list, kmmio_page_list(f->addr)); + + return 0; +} + +/* You must be holding kmmio_lock. */ +static void release_kmmio_fault_page(unsigned long addr, + struct kmmio_fault_page **release_list) +{ + struct kmmio_fault_page *f; + + f = get_kmmio_fault_page(addr); + if (!f) + return; + + f->count--; + BUG_ON(f->count < 0); + if (!f->count) { + disarm_kmmio_fault_page(f); + if (!f->scheduled_for_release) { + f->release_next = *release_list; + *release_list = f; + f->scheduled_for_release = true; + } + } +} + +/* + * With page-unaligned ioremaps, one or two armed pages may contain + * addresses from outside the intended mapping. Events for these addresses + * are currently silently dropped. The events may result only from programming + * mistakes by accessing addresses before the beginning or past the end of a + * mapping. + */ +int register_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + int ret = 0; + unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; + + spin_lock_irqsave(&kmmio_lock, flags); + if (get_kmmio_probe(addr)) { + ret = -EEXIST; + goto out; + } + + pte = lookup_address(addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + + kmmio_count++; + list_add_rcu(&p->list, &kmmio_probes); + while (size < size_lim) { + if (add_kmmio_fault_page(addr + size)) + pr_err("Unable to set page fault.\n"); + size += page_level_size(l); + } +out: + spin_unlock_irqrestore(&kmmio_lock, flags); + /* + * XXX: What should I do here? + * Here was a call to global_flush_tlb(), but it does not exist + * anymore. It seems it's not needed after all. + */ + return ret; +} +EXPORT_SYMBOL(register_kmmio_probe); + +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *f = dr->release_list; + while (f) { + struct kmmio_fault_page *next = f->release_next; + BUG_ON(f->count); + kfree(f); + f = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = + container_of(head, struct kmmio_delayed_release, rcu); + struct kmmio_fault_page *f = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + + spin_lock_irqsave(&kmmio_lock, flags); + while (f) { + if (!f->count) { + list_del_rcu(&f->list); + prevp = &f->release_next; + } else { + *prevp = f->release_next; + f->release_next = NULL; + f->scheduled_for_release = false; + } + f = *prevp; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actually free the kmmio_fault_page structs as with RCU. + */ +void unregister_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(addr, &l); + if (!pte) + return; + + spin_lock_irqsave(&kmmio_lock, flags); + while (size < size_lim) { + release_kmmio_fault_page(addr + size, &release_list); + size += page_level_size(l); + } + list_del_rcu(&p->list); + kmmio_count--; + spin_unlock_irqrestore(&kmmio_lock, flags); + + if (!release_list) + return; + + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. + * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); +} +EXPORT_SYMBOL(unregister_kmmio_probe); + +static int +kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) +{ + struct die_args *arg = args; + unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err); + + if (val == DIE_DEBUG && (*dr6_p & DR_STEP)) + if (post_kmmio_handler(*dr6_p, arg->regs) == 1) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + *dr6_p &= ~DR_STEP; + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +int kmmio_init(void) +{ + int i; + + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + + return register_die_notifier(&nb_die); +} + +void kmmio_cleanup(void) +{ + int i; + + unregister_die_notifier(&nb_die); + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { + WARN_ONCE(!list_empty(&kmmio_page_table[i]), + KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); + } +} diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 000000000..c0499c389 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,400 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-direct.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> + +#include "mm_internal.h" + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +u64 sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL(sme_me_mask); +DEFINE_STATIC_KEY_FALSE(sev_enable_key); +EXPORT_SYMBOL_GPL(sev_enable_key); + +bool sev_enabled __section(.data); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; + } while (size); + + __native_flush_tlb(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); + + if (sev_active()) + swiotlb_force = SWIOTLB_FORCE; +} + +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + pgprot_t old_prot, new_prot; + unsigned long pfn, pa, size; + pte_t new_pte; + + switch (level) { + case PG_LEVEL_4K: + pfn = pte_pfn(*kpte); + old_prot = pte_pgprot(*kpte); + break; + case PG_LEVEL_2M: + pfn = pmd_pfn(*(pmd_t *)kpte); + old_prot = pmd_pgprot(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + pfn = pud_pfn(*(pud_t *)kpte); + old_prot = pud_pgprot(*(pud_t *)kpte); + break; + default: + return; + } + + new_prot = old_prot; + if (enc) + pgprot_val(new_prot) |= _PAGE_ENC; + else + pgprot_val(new_prot) &= ~_PAGE_ENC; + + /* If prot is same then do nothing. */ + if (pgprot_val(old_prot) == pgprot_val(new_prot)) + return; + + pa = pfn << PAGE_SHIFT; + size = page_level_size(level); + + /* + * We are going to perform in-place en-/decryption and change the + * physical page attribute from C=1 to C=0 or vice versa. Flush the + * caches to ensure that data gets accessed with the correct C-bit. + */ + clflush_cache_range(__va(pa), size); + + /* Encrypt/decrypt the contents in-place */ + if (enc) + sme_early_encrypt(pa, size); + else + sme_early_decrypt(pa, size); + + /* Change the page encryption mask. */ + new_pte = pfn_pte(pfn, new_prot); + set_pte_atomic(kpte, new_pte); +} + +static int __init early_set_memory_enc_dec(unsigned long vaddr, + unsigned long size, bool enc) +{ + unsigned long vaddr_end, vaddr_next; + unsigned long psize, pmask; + int split_page_size_mask; + int level, ret; + pte_t *kpte; + + vaddr_next = vaddr; + vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + ret = 1; + goto out; + } + + if (level == PG_LEVEL_4K) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; + continue; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Check whether we can change the large page in one go. + * We request a split when the address is not aligned and + * the number of pages to set/clear encryption bit is smaller + * than the number of pages in the large page. + */ + if (vaddr == (vaddr & pmask) && + ((vaddr_end - vaddr) >= psize)) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & pmask) + psize; + continue; + } + + /* + * The virtual address is part of a larger page, create the next + * level page table mapping (4K or 2M). If it is part of a 2M + * page then we request a split of the large page into 4K + * chunks. A 1GB large page is split into 2M pages, resp. + */ + if (level == PG_LEVEL_2M) + split_page_size_mask = 0; + else + split_page_size_mask = 1 << PG_LEVEL_2M; + + kernel_physical_mapping_init(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); + } + + ret = 0; + +out: + __flush_tlb_all(); + return ret; +} + +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, false); +} + +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, true); +} + +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * sme_active() and sev_active() functions are used for this. When a + * distinction isn't needed, the mem_encrypt_active() function can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. + */ +bool sme_active(void) +{ + return sme_me_mask && !sev_enabled; +} +EXPORT_SYMBOL(sme_active); + +bool sev_active(void) +{ + return sme_me_mask && sev_enabled; +} +EXPORT_SYMBOL(sev_active); + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. + */ + if (mem_encrypt_active()) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} + +void __init mem_encrypt_init(void) +{ + if (!sme_me_mask) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); + + /* + * With SEV, DMA operations cannot use encryption, we need to use + * SWIOTLB to bounce buffer DMA operation. + */ + if (sev_active()) + dma_ops = &swiotlb_dma_ops; + + /* + * With SEV, we need to unroll the rep string I/O instructions. + */ + if (sev_active()) + static_branch_enable(&sev_enable_key); + + pr_info("AMD %s active\n", + sev_active() ? "Secure Encrypted Virtualization (SEV)" + : "Secure Memory Encryption (SME)"); +} + diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S new file mode 100644 index 000000000..40a608506 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -0,0 +1,159 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> +#include <asm/nospec-branch.h> + + .text + .code64 +ENTRY(sme_encrypt_execute) + + /* + * Entry parameters: + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping + * RDX - length to encrypt + * RCX - virtual address of the encryption workarea, including: + * - stack page (PAGE_SIZE) + * - encryption routine page (PAGE_SIZE) + * - intermediate copy buffer (PMD_PAGE_SIZE) + * R8 - physcial address of the pagetables to use for encryption + */ + + push %rbp + movq %rsp, %rbp /* RBP now has original stack pointer */ + + /* Set up a one page stack in the non-encrypted memory area */ + movq %rcx, %rax /* Workarea stack page */ + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */ + addq $PAGE_SIZE, %rax /* Workarea encryption routine */ + + push %r12 + movq %rdi, %r10 /* Encrypted area */ + movq %rsi, %r11 /* Decrypted area */ + movq %rdx, %r12 /* Area length */ + + /* Copy encryption routine into the workarea */ + movq %rax, %rdi /* Workarea encryption routine */ + leaq __enc_copy(%rip), %rsi /* Encryption routine */ + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */ + rep movsb + + /* Setup registers for call */ + movq %r10, %rdi /* Encrypted area */ + movq %r11, %rsi /* Decrypted area */ + movq %r8, %rdx /* Pagetables used for encryption */ + movq %r12, %rcx /* Area length */ + movq %rax, %r8 /* Workarea encryption routine */ + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ + + ANNOTATE_RETPOLINE_SAFE + call *%rax /* Call the encryption routine */ + + pop %r12 + + movq %rbp, %rsp /* Restore original stack pointer */ + pop %rbp + + ret +ENDPROC(sme_encrypt_execute) + +ENTRY(__enc_copy) +/* + * Routine used to encrypt memory in place. + * This routine must be run outside of the kernel proper since + * the kernel will be encrypted during the process. So this + * routine is defined here and then copied to an area outside + * of the kernel where it will remain and run decrypted + * during execution. + * + * On entry the registers must be: + * RDI - virtual address for the encrypted mapping + * RSI - virtual address for the decrypted mapping + * RDX - address of the pagetables to use for encryption + * RCX - length of area + * R8 - intermediate copy buffer + * + * RAX - points to this routine + * + * The area will be encrypted by copying from the non-encrypted + * memory space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted memory space. The physical + * addresses of the two mappings are the same which results in the area + * being encrypted "in place". + */ + /* Enable the new page tables */ + mov %rdx, %cr3 + + /* Flush any global TLBs */ + mov %cr4, %rdx + andq $~X86_CR4_PGE, %rdx + mov %rdx, %cr4 + orq $X86_CR4_PGE, %rdx + mov %rdx, %cr4 + + push %r15 + push %r12 + + movq %rcx, %r9 /* Save area length */ + movq %rdi, %r10 /* Save encrypted area address */ + movq %rsi, %r11 /* Save decrypted area address */ + + /* Set the PAT register PA5 entry to write-protect */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + mov %rdx, %r15 /* Save original PAT value */ + andl $0xffff00ff, %edx /* Clear PA5 */ + orl $0x00000500, %edx /* Set PA5 to WP */ + wrmsr + + wbinvd /* Invalidate any cache entries */ + + /* Copy/encrypt up to 2MB at a time */ + movq $PMD_PAGE_SIZE, %r12 +1: + cmpq %r12, %r9 + jnb 2f + movq %r9, %r12 + +2: + movq %r11, %rsi /* Source - decrypted area */ + movq %r8, %rdi /* Dest - intermediate copy buffer */ + movq %r12, %rcx + rep movsb + + movq %r8, %rsi /* Source - intermediate copy buffer */ + movq %r10, %rdi /* Dest - encrypted area */ + movq %r12, %rcx + rep movsb + + addq %r12, %r11 + addq %r12, %r10 + subq %r12, %r9 /* Kernel length decrement */ + jnz 1b /* Kernel length not zero? */ + + /* Restore PAT register */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + mov %r15, %rdx /* Restore original PAT value */ + wrmsr + + pop %r12 + pop %r15 + + ret +.L__enc_copy_end: +ENDPROC(__enc_copy) diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c new file mode 100644 index 000000000..650d5a6ca --- /dev/null +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -0,0 +1,576 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define DISABLE_BRANCH_PROFILING + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* + * Special hack: we have to be careful, because no indirections are + * allowed here, and paravirt_ops is a kind of one. As it will only run in + * baremetal anyway, we just keep it from happening. (This list needs to + * be extended when new paravirt and debugging variants are added.) + */ +#undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_SPINLOCKS + +/* + * This code runs before CPU feature bits are set. By default, the + * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if + * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5 + * is provided to handle this situation and, instead, use a variable that + * has been set by the early boot code. + */ +#define USE_EARLY_PGTABLE_L5 + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mem_encrypt.h> + +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/cmdline.h> + +#include "mm_internal.h" + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \ + (_PAGE_PAT_LARGE | _PAGE_PWT)) + +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) + +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) + +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_flags; + pteval_t pte_flags; + unsigned long paddr; + + unsigned long vaddr; + unsigned long vaddr_end; +}; + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); + + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); + + memset(pgd_p, 0, pgd_size); +} + +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = ppd->pgd + pgd_index(ppd->vaddr); + if (pgd_none(*pgd)) { + p4d = ppd->pgtable_area; + memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D); + ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D; + set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d))); + } + + p4d = p4d_offset(pgd, ppd->vaddr); + if (p4d_none(*p4d)) { + pud = ppd->pgtable_area; + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD); + ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD; + set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud))); + } + + pud = pud_offset(p4d, ppd->vaddr); + if (pud_none(*pud)) { + pmd = ppd->pgtable_area; + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD); + ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD; + set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); + } + + if (pud_large(*pud)) + return NULL; + + return pud; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_large(*pmd)) + return; + + set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); +} + +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_none(*pmd)) { + pte = ppd->pgtable_area; + memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE; + set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); + } + + if (pmd_large(*pmd)) + return; + + pte = pte_offset_map(pmd, ppd->vaddr); + if (pte_none(*pte)) + set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_PAGE_SIZE; + ppd->paddr += PMD_PAGE_SIZE; + } +} + +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long entries = 0, tables = 0; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. + */ + + /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */ + if (PTRS_PER_P4D > 1) + entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D; + entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD; + entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD; + entries += 2 * sizeof(pte_t) * PTRS_PER_PTE; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + + if (PTRS_PER_P4D > 1) + tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D; + tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD; + tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD; + + return entries + tables; +} + +void __init sme_encrypt_kernel(struct boot_params *bp) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; + struct sme_populate_pgd_data ppd; + unsigned long pgtable_area_len; + unsigned long decrypted_base; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel and initrd as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel and initrd as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses gives us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE); + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + ppd.pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + ppd.pgd = (pgd_t *)native_read_cr3_pa(); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. + */ + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. + */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } + decrypted_base <<= PGDIR_SHIFT; + + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + } + + /* Add decrypted workarea mappings to both kernel mappings */ + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)ppd.pgd); + + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); + + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } + + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) + /* + * Set the feature mask (SME or SEV) based on whether we are + * running under a hypervisor. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & feature_mask)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + /* For SME, check the SYSCFG MSR */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + } else { + /* For SEV, check the SEV MSR */ + msr = __rdmsr(MSR_AMD64_SEV); + if (!(msr & MSR_AMD64_SEV_ENABLED)) + return; + + /* SEV state cannot be controlled by a command line option */ + sme_me_mask = me_mask; + sev_enabled = true; + physical_mask &= ~sme_me_mask; + return; + } + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; + + physical_mask &= ~sme_me_mask; +} diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h new file mode 100644 index 000000000..4e1f6e1b8 --- /dev/null +++ b/arch/x86/mm/mm_internal.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_MM_INTERNAL_H +#define __X86_MM_INTERNAL_H + +void *alloc_low_pages(unsigned int num); +static inline void *alloc_low_page(void) +{ + return alloc_low_pages(1); +} + +void early_ioremap_page_table_range_init(void); + +unsigned long kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +void zone_sizes_init(void); + +extern int after_bootmem; + +void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); + +#endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c new file mode 100644 index 000000000..b69f7d428 --- /dev/null +++ b/arch/x86/mm/mmap.c @@ -0,0 +1,263 @@ +/* + * Flexible mmap layout support + * + * Based on code by Ingo Molnar and Andi Kleen, copyrighted + * as follows: + * + * Copyright 2003-2009 Red Hat Inc. + * All Rights Reserved. + * Copyright 2005 Andi Kleen, SUSE Labs. + * Copyright 2007 Jiri Kosina, SUSE Labs. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/personality.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/limits.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> +#include <linux/compat.h> +#include <asm/elf.h> + +#include "physaddr.h" + +struct va_alignment __read_mostly va_align = { + .flags = -1, +}; + +unsigned long task_size_32bit(void) +{ + return IA32_PAGE_OFFSET; +} + +unsigned long task_size_64bit(int full_addr_space) +{ + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; +} + +static unsigned long stack_maxrandom_size(unsigned long task_size) +{ + unsigned long max = 0; + if (current->flags & PF_RANDOMIZE) { + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); + max <<= PAGE_SHIFT; + } + + return max; +} + +#ifdef CONFIG_COMPAT +# define mmap32_rnd_bits mmap_rnd_compat_bits +# define mmap64_rnd_bits mmap_rnd_bits +#else +# define mmap32_rnd_bits mmap_rnd_bits +# define mmap64_rnd_bits mmap_rnd_bits +#endif + +#define SIZE_128M (128 * 1024 * 1024UL) + +static int mmap_is_legacy(void) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + return sysctl_legacy_va_layout; +} + +static unsigned long arch_rnd(unsigned int rndbits) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; +} + +unsigned long arch_mmap_rnd(void) +{ + return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); +} + +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, + struct rlimit *rlim_stack) +{ + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; + unsigned long gap_min, gap_max; + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + + /* + * Top of mmap area (just below the process stack). + * Leave an at least ~128 MB hole with possible stack randomization. + */ + gap_min = SIZE_128M; + gap_max = (task_size / 6) * 5; + + if (gap < gap_min) + gap = gap_min; + else if (gap > gap_max) + gap = gap_max; + + return PAGE_ALIGN(task_size - gap - rnd); +} + +static unsigned long mmap_legacy_base(unsigned long rnd, + unsigned long task_size) +{ + return __TASK_UNMAPPED_BASE(task_size) + rnd; +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, + unsigned long random_factor, unsigned long task_size, + struct rlimit *rlim_stack) +{ + *legacy_base = mmap_legacy_base(random_factor, task_size); + if (mmap_is_legacy()) + *base = *legacy_base; + else + *base = mmap_base(random_factor, task_size, rlim_stack); +} + +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +{ + if (mmap_is_legacy()) + mm->get_unmapped_area = arch_get_unmapped_area; + else + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + + arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, + arch_rnd(mmap64_rnd_bits), task_size_64bit(0), + rlim_stack); + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* + * The mmap syscall mapping base decision depends solely on the + * syscall type (64-bit or compat). This applies for 64bit + * applications and 32bit applications. The 64bit syscall uses + * mmap_base, the compat syscall uses mmap_compat_base. + */ + arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, + arch_rnd(mmap32_rnd_bits), task_size_32bit(), + rlim_stack); +#endif +} + +unsigned long get_mmap_base(int is_legacy) +{ + struct mm_struct *mm = current->mm; + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + if (in_compat_syscall()) { + return is_legacy ? mm->mmap_compat_legacy_base + : mm->mmap_compat_base; + } +#endif + return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_MPX) + return "[mpx]"; + return NULL; +} + +/** + * mmap_address_hint_valid - Validate the address hint of mmap + * @addr: Address hint + * @len: Mapping length + * + * Check whether @addr and @addr + @len result in a valid mapping. + * + * On 32bit this only checks whether @addr + @len is <= TASK_SIZE. + * + * On 64bit with 5-level page tables another sanity check is required + * because mappings requested by mmap(@addr, 0) which cross the 47-bit + * virtual address boundary can cause the following theoretical issue: + * + * An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr + * is below the border of the 47-bit address space and @addr + @len is + * above the border. + * + * With 4-level paging this request succeeds, but the resulting mapping + * address will always be within the 47-bit virtual address space, because + * the hint address does not result in a valid mapping and is + * ignored. Hence applications which are not prepared to handle virtual + * addresses above 47-bit work correctly. + * + * With 5-level paging this request would be granted and result in a + * mapping which crosses the border of the 47-bit virtual address + * space. If the application cannot handle addresses above 47-bit this + * will lead to misbehaviour and hard to diagnose failures. + * + * Therefore ignore address hints which would result in a mapping crossing + * the 47-bit virtual address boundary. + * + * Note, that in the same scenario with MAP_FIXED the behaviour is + * different. The request with @addr < 47-bit and @addr + @len > 47-bit + * fails on a 4-level paging machine but succeeds on a 5-level paging + * machine. It is reasonable to expect that an application does not rely on + * the failure of such a fixed mapping request, so the restriction is not + * applied. + */ +bool mmap_address_hint_valid(unsigned long addr, unsigned long len) +{ + if (TASK_SIZE - len < addr) + return false; + + return (addr > DEFAULT_MAP_WINDOW) == (addr + len > DEFAULT_MAP_WINDOW); +} + +/* Can we access it for direct reading/writing? Must be RAM: */ +int valid_phys_addr_range(phys_addr_t addr, size_t count) +{ + return addr + count - 1 <= __pa(high_memory - 1); +} + +/* Can we access it through mmap? Must be a valid physical address: */ +int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) +{ + phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT; + + return phys_addr_valid(addr + count - 1); +} + +/* + * Only allow root to set high MMIO mappings to PROT_NONE. + * This prevents an unpriv. user to set them to PROT_NONE and invert + * them, then pointing to valid memory for L1TF speculation. + * + * Note: for locked down kernels may want to disable the root override. + */ +bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return true; + if (!__pte_needs_invert(pgprot_val(prot))) + return true; + /* If it's real memory always allow */ + if (pfn_valid(pfn)) + return true; + if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + return false; + return true; +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c new file mode 100644 index 000000000..e32b003e0 --- /dev/null +++ b/arch/x86/mm/mmio-mod.c @@ -0,0 +1,477 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 <pq@iki.fi> + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. + */ + +#define pr_fmt(fmt) "mmiotrace: " fmt + +#define DEBUG 1 + +#include <linux/moduleparam.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <asm/pgtable.h> +#include <linux/mmiotrace.h> +#include <asm/e820/api.h> /* for ISA_START_ADDRESS */ +#include <linux/atomic.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include "pf_in.h" + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + resource_size_t phys; + unsigned long id; +}; + +/* Accessed per-cpu. */ +static DEFINE_PER_CPU(struct trap_reason, pf_reason); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); + +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed. + * - pre/post callbacks assume the effect of is_enabled() being true. + */ + +/* module parameters */ +static unsigned long filter_offset; +static bool nommiotrace; +static bool trace_pc; + +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} + +static void print_pte(unsigned long address) +{ + unsigned int level; + pte_t *pte = lookup_address(address, &level); + + if (!pte) { + pr_err("Error in %s: no pte for page 0x%08lx\n", + __func__, address); + return; + } + + if (level == PG_LEVEL_2M) { + pr_emerg("4MB pages are not currently supported: 0x%08lx\n", + address); + BUG(); + } + pr_info("pte for 0x%lx: 0x%llx 0x%llx\n", + address, + (unsigned long long)pte_val(*pte), + (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. + */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const struct trap_reason *my_reason = &get_cpu_var(pf_reason); + pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n", + addr, my_reason->addr); + print_pte(addr); + pr_emerg("faulting IP is at %pS\n", (void *)regs->ip); + pr_emerg("last faulting IP was at %pS\n", (void *)my_reason->ip); +#ifdef __i386__ + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + put_cpu_var(pf_reason); + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->private; + + /* it doesn't make sense to have more than one active trace per cpu */ + if (my_reason->active_traces) + die_kmmio_nesting_error(regs, addr); + else + my_reason->active_traces++; + + my_reason->type = type; + my_reason->addr = addr; + my_reason->ip = instptr; + + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + my_trace->pc = instptr; + else + my_trace->pc = 0; + + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? + */ + + switch (type) { + case REG_READ: + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); + break; + case REG_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | + *(ip + 2); + } + } + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + + /* this should always return the active_trace count to 0 */ + my_reason->active_traces--; + if (my_reason->active_traces) { + pr_emerg("unexpected post handler"); + BUG(); + } + + switch (my_reason->type) { + case REG_READ: + my_trace->value = get_ins_reg_val(my_reason->ip, regs); + break; + default: + break; + } + + mmio_trace_rw(my_trace); + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void ioremap_trace_core(resource_size_t offset, unsigned long size, + void __iomem *addr) +{ + static atomic_t next_id; + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + /* These are page-unaligned. */ + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE + }; + + if (!trace) { + pr_err("kmalloc failed in ioremap\n"); + return; + } + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + .private = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) + }; + map.map_id = trace->id; + + spin_lock_irq(&trace_lock); + if (!is_enabled()) { + kfree(trace); + goto not_enabled; + } + + mmio_trace_mapping(&map); + list_add_tail(&trace->list, &trace_list); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); +} + +void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr) +{ + if (!is_enabled()) /* recheck and proper locking in *_core() */ + return; + + pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) + return; + ioremap_trace_core(offset, size, addr); +} + +static void iounmap_trace_core(volatile void __iomem *addr) +{ + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE + }; + struct remap_trace *trace; + struct remap_trace *tmp; + struct remap_trace *found_trace = NULL; + + pr_debug("Unmapping %p.\n", addr); + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + found_trace = trace; + break; + } + } + map.map_id = (found_trace) ? found_trace->id : -1; + mmio_trace_mapping(&map); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); +} + +int mmiotrace_printk(const char *fmt, ...) +{ + int ret = 0; + va_list args; + unsigned long flags; + va_start(args, fmt); + + spin_lock_irqsave(&trace_lock, flags); + if (is_enabled()) + ret = mmio_trace_printk(fmt, args); + spin_unlock_irqrestore(&trace_lock, flags); + + va_end(args); + return ret; +} +EXPORT_SYMBOL(mmiotrace_printk); + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. + */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + list_del(&trace->list); + kfree(trace); + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static cpumask_var_t downed_cpus; + +static void enter_uniprocessor(void) +{ + int cpu; + int err; + + if (!cpumask_available(downed_cpus) && + !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { + pr_notice("Failed to allocate mask\n"); + goto out; + } + + get_online_cpus(); + cpumask_copy(downed_cpus, cpu_online_mask); + cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); + if (num_online_cpus() > 1) + pr_notice("Disabling non-boot CPUs...\n"); + put_online_cpus(); + + for_each_cpu(cpu, downed_cpus) { + err = cpu_down(cpu); + if (!err) + pr_info("CPU%d is down.\n", cpu); + else + pr_err("Error taking CPU%d down: %d\n", cpu, err); + } +out: + if (num_online_cpus() > 1) + pr_warning("multiple CPUs still online, may miss events.\n"); +} + +static void leave_uniprocessor(void) +{ + int cpu; + int err; + + if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0) + return; + pr_notice("Re-enabling CPUs...\n"); + for_each_cpu(cpu, downed_cpus) { + err = cpu_up(cpu); + if (!err) + pr_info("enabled CPU%d.\n", cpu); + else + pr_err("cannot re-enable CPU%d: %d\n", cpu, err); + } +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static void enter_uniprocessor(void) +{ + if (num_online_cpus() > 1) + pr_warning("multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); +} + +static void leave_uniprocessor(void) +{ +} +#endif + +void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + + if (nommiotrace) + pr_info("MMIO tracing disabled.\n"); + kmmio_init(); + enter_uniprocessor(); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info("enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + leave_uniprocessor(); + kmmio_cleanup(); + pr_info("disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c new file mode 100644 index 000000000..e500949ba --- /dev/null +++ b/arch/x86/mm/mpx.c @@ -0,0 +1,948 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mpx.c - Memory Protection eXtensions + * + * Copyright (c) 2014, Intel Corporation. + * Qiaowei Ren <qiaowei.ren@intel.com> + * Dave Hansen <dave.hansen@intel.com> + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm_types.h> +#include <linux/syscalls.h> +#include <linux/sched/sysctl.h> + +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <asm/mman.h> +#include <asm/mmu_context.h> +#include <asm/mpx.h> +#include <asm/processor.h> +#include <asm/fpu/internal.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/mpx.h> + +static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BD_SIZE_BYTES_64; + else + return MPX_BD_SIZE_BYTES_32; +} + +static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BT_SIZE_BYTES_64; + else + return MPX_BT_SIZE_BYTES_32; +} + +/* + * This is really a simplified "vm_mmap". it only handles MPX + * bounds tables (the bounds directory is user-allocated). + */ +static unsigned long mpx_mmap(unsigned long len) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, populate; + + /* Only bounds table can be allocated here */ + if (len != mpx_bt_size_bytes(mm)) + return -EINVAL; + + down_write(&mm->mmap_sem); + addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL); + up_write(&mm->mmap_sem); + if (populate) + mm_populate(addr, populate); + + return addr; +} + +static int mpx_insn_decode(struct insn *insn, + struct pt_regs *regs) +{ + unsigned char buf[MAX_INSN_SIZE]; + int x86_64 = !test_thread_flag(TIF_IA32); + int not_copied; + int nr_copied; + + not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + /* + * The decoder _should_ fail nicely if we pass it a short buffer. + * But, let's not depend on that implementation detail. If we + * did not get anything, just error out now. + */ + if (!nr_copied) + return -EFAULT; + insn_init(insn, buf, nr_copied, x86_64); + insn_get_length(insn); + /* + * copy_from_user() tries to get as many bytes as we could see in + * the largest possible instruction. If the instruction we are + * after is shorter than that _and_ we attempt to copy from + * something unreadable, we might get a short read. This is OK + * as long as the read did not stop in the middle of the + * instruction. Check to see if we got a partial instruction. + */ + if (nr_copied < insn->length) + return -EFAULT; + + insn_get_opcode(insn); + /* + * We only _really_ need to decode bndcl/bndcn/bndcu + * Error out on anything else. + */ + if (insn->opcode.bytes[0] != 0x0f) + goto bad_opcode; + if ((insn->opcode.bytes[1] != 0x1a) && + (insn->opcode.bytes[1] != 0x1b)) + goto bad_opcode; + + return 0; +bad_opcode: + return -EINVAL; +} + +/* + * If a bounds overflow occurs then a #BR is generated. This + * function decodes MPX instructions to get violation address + * and set this address into extended struct siginfo. + * + * Note that this is not a super precise way of doing this. + * Userspace could have, by the time we get here, written + * anything it wants in to the instructions. We can not + * trust anything about it. They might not be valid + * instructions or might encode invalid registers, etc... + * + * The caller is expected to kfree() the returned siginfo_t. + */ +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) +{ + const struct mpx_bndreg_state *bndregs; + const struct mpx_bndreg *bndreg; + siginfo_t *info = NULL; + struct insn insn; + uint8_t bndregno; + int err; + + err = mpx_insn_decode(&insn, regs); + if (err) + goto err_out; + + /* + * We know at this point that we are only dealing with + * MPX instructions. + */ + insn_get_modrm(&insn); + bndregno = X86_MODRM_REG(insn.modrm.value); + if (bndregno > 3) { + err = -EINVAL; + goto err_out; + } + /* get bndregs field from current task's xsave area */ + bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS); + if (!bndregs) { + err = -EINVAL; + goto err_out; + } + /* now go select the individual register in the set of 4 */ + bndreg = &bndregs->bndreg[bndregno]; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto err_out; + } + /* + * The registers are always 64-bit, but the upper 32 + * bits are ignored in 32-bit mode. Also, note that the + * upper bounds are architecturally represented in 1's + * complement form. + * + * The 'unsigned long' cast is because the compiler + * complains when casting from integers to different-size + * pointers. + */ + info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound; + info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound; + info->si_addr_lsb = 0; + info->si_signo = SIGSEGV; + info->si_errno = 0; + info->si_code = SEGV_BNDERR; + info->si_addr = insn_get_addr_ref(&insn, regs); + /* + * We were not able to extract an address from the instruction, + * probably because there was something invalid in it. + */ + if (info->si_addr == (void __user *)-1) { + err = -EINVAL; + goto err_out; + } + trace_mpx_bounds_register_exception(info->si_addr, bndreg); + return info; +err_out: + /* info might be NULL, but kfree() handles that */ + kfree(info); + return ERR_PTR(err); +} + +static __user void *mpx_get_bounds_dir(void) +{ + const struct mpx_bndcsr *bndcsr; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * The bounds directory pointer is stored in a register + * only accessible if we first do an xsave. + */ + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + if (!bndcsr) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Make sure the register looks valid by checking the + * enable bit. + */ + if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Lastly, mask off the low bits used for configuration + * flags, and return the address of the bounds table. + */ + return (void __user *)(unsigned long) + (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); +} + +int mpx_enable_management(void) +{ + void __user *bd_base = MPX_INVALID_BOUNDS_DIR; + struct mm_struct *mm = current->mm; + int ret = 0; + + /* + * runtime in the userspace will be responsible for allocation of + * the bounds directory. Then, it will save the base of the bounds + * directory into XSAVE/XRSTOR Save Area and enable MPX through + * XRSTOR instruction. + * + * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is + * expected to be relatively expensive. Storing the bounds + * directory here means that we do not have to do xsave in the + * unmap path; we can just use mm->context.bd_addr instead. + */ + bd_base = mpx_get_bounds_dir(); + down_write(&mm->mmap_sem); + + /* MPX doesn't support addresses above 47 bits yet. */ + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { + pr_warn_once("%s (%d): MPX cannot handle addresses " + "above 47-bits. Disabling.", + current->comm, current->pid); + ret = -ENXIO; + goto out; + } + mm->context.bd_addr = bd_base; + if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) + ret = -ENXIO; +out: + up_write(&mm->mmap_sem); + return ret; +} + +int mpx_disable_management(void) +{ + struct mm_struct *mm = current->mm; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return -ENXIO; + + down_write(&mm->mmap_sem); + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; + up_write(&mm->mmap_sem); + return 0; +} + +static int mpx_cmpxchg_bd_entry(struct mm_struct *mm, + unsigned long *curval, + unsigned long __user *addr, + unsigned long old_val, unsigned long new_val) +{ + int ret; + /* + * user_atomic_cmpxchg_inatomic() actually uses sizeof() + * the pointer that we pass to it to figure out how much + * data to cmpxchg. We have to be careful here not to + * pass a pointer to a 64-bit data type when we only want + * a 32-bit copy. + */ + if (is_64bit_mm(mm)) { + ret = user_atomic_cmpxchg_inatomic(curval, + addr, old_val, new_val); + } else { + u32 uninitialized_var(curval_32); + u32 old_val_32 = old_val; + u32 new_val_32 = new_val; + u32 __user *addr_32 = (u32 __user *)addr; + + ret = user_atomic_cmpxchg_inatomic(&curval_32, + addr_32, old_val_32, new_val_32); + *curval = curval_32; + } + return ret; +} + +/* + * With 32-bit mode, a bounds directory is 4MB, and the size of each + * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB, + * and the size of each bounds table is 4MB. + */ +static int allocate_bt(struct mm_struct *mm, long __user *bd_entry) +{ + unsigned long expected_old_val = 0; + unsigned long actual_old_val = 0; + unsigned long bt_addr; + unsigned long bd_new_entry; + int ret = 0; + + /* + * Carve the virtual space out of userspace for the new + * bounds table: + */ + bt_addr = mpx_mmap(mpx_bt_size_bytes(mm)); + if (IS_ERR((void *)bt_addr)) + return PTR_ERR((void *)bt_addr); + /* + * Set the valid flag (kinda like _PAGE_PRESENT in a pte) + */ + bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + + /* + * Go poke the address of the new bounds table in to the + * bounds directory entry out in userspace memory. Note: + * we may race with another CPU instantiating the same table. + * In that case the cmpxchg will see an unexpected + * 'actual_old_val'. + * + * This can fault, but that's OK because we do not hold + * mmap_sem at this point, unlike some of the other part + * of the MPX code that have to pagefault_disable(). + */ + ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry, + expected_old_val, bd_new_entry); + if (ret) + goto out_unmap; + + /* + * The user_atomic_cmpxchg_inatomic() will only return nonzero + * for faults, *not* if the cmpxchg itself fails. Now we must + * verify that the cmpxchg itself completed successfully. + */ + /* + * We expected an empty 'expected_old_val', but instead found + * an apparently valid entry. Assume we raced with another + * thread to instantiate this table and desclare succecss. + */ + if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) { + ret = 0; + goto out_unmap; + } + /* + * We found a non-empty bd_entry but it did not have the + * VALID_FLAG set. Return an error which will result in + * a SEGV since this probably means that somebody scribbled + * some invalid data in to a bounds table. + */ + if (expected_old_val != actual_old_val) { + ret = -EINVAL; + goto out_unmap; + } + trace_mpx_new_bounds_table(bt_addr); + return 0; +out_unmap: + vm_munmap(bt_addr, mpx_bt_size_bytes(mm)); + return ret; +} + +/* + * When a BNDSTX instruction attempts to save bounds to a bounds + * table, it will first attempt to look up the table in the + * first-level bounds directory. If it does not find a table in + * the directory, a #BR is generated and we get here in order to + * allocate a new table. + * + * With 32-bit mode, the size of BD is 4MB, and the size of each + * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, + * and the size of each bound table is 4MB. + */ +static int do_mpx_bt_fault(void) +{ + unsigned long bd_entry, bd_base; + const struct mpx_bndcsr *bndcsr; + struct mm_struct *mm = current->mm; + + bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + if (!bndcsr) + return -EINVAL; + /* + * Mask off the preserve and enable bits + */ + bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK; + /* + * The hardware provides the address of the missing or invalid + * entry via BNDSTATUS, so we don't have to go look it up. + */ + bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK; + /* + * Make sure the directory entry is within where we think + * the directory is. + */ + if ((bd_entry < bd_base) || + (bd_entry >= bd_base + mpx_bd_size_bytes(mm))) + return -EINVAL; + + return allocate_bt(mm, (long __user *)bd_entry); +} + +int mpx_handle_bd_fault(void) +{ + /* + * Userspace never asked us to manage the bounds tables, + * so refuse to help. + */ + if (!kernel_managing_mpx_tables(current->mm)) + return -EINVAL; + + return do_mpx_bt_fault(); +} + +/* + * A thin wrapper around get_user_pages(). Returns 0 if the + * fault was resolved or -errno if not. + */ +static int mpx_resolve_fault(long __user *addr, int write) +{ + long gup_ret; + int nr_pages = 1; + + gup_ret = get_user_pages((unsigned long)addr, nr_pages, + write ? FOLL_WRITE : 0, NULL, NULL); + /* + * get_user_pages() returns number of pages gotten. + * 0 means we failed to fault in and get anything, + * probably because 'addr' is bad. + */ + if (!gup_ret) + return -EFAULT; + /* Other error, return it */ + if (gup_ret < 0) + return gup_ret; + /* must have gup'd a page and gup_ret>0, success */ + return 0; +} + +static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, + unsigned long bd_entry) +{ + unsigned long bt_addr = bd_entry; + int align_to_bytes; + /* + * Bit 0 in a bt_entry is always the valid bit. + */ + bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG; + /* + * Tables are naturally aligned at 8-byte boundaries + * on 64-bit and 4-byte boundaries on 32-bit. The + * documentation makes it appear that the low bits + * are ignored by the hardware, so we do the same. + */ + if (is_64bit_mm(mm)) + align_to_bytes = 8; + else + align_to_bytes = 4; + bt_addr &= ~(align_to_bytes-1); + return bt_addr; +} + +/* + * We only want to do a 4-byte get_user() on 32-bit. Otherwise, + * we might run off the end of the bounds table if we are on + * a 64-bit kernel and try to get 8 bytes. + */ +static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, + long __user *bd_entry_ptr) +{ + u32 bd_entry_32; + int ret; + + if (is_64bit_mm(mm)) + return get_user(*bd_entry_ret, bd_entry_ptr); + + /* + * Note that get_user() uses the type of the *pointer* to + * establish the size of the get, not the destination. + */ + ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr); + *bd_entry_ret = bd_entry_32; + return ret; +} + +/* + * Get the base of bounds tables pointed by specific bounds + * directory entry. + */ +static int get_bt_addr(struct mm_struct *mm, + long __user *bd_entry_ptr, + unsigned long *bt_addr_result) +{ + int ret; + int valid_bit; + unsigned long bd_entry; + unsigned long bt_addr; + + if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr))) + return -EFAULT; + + while (1) { + int need_write = 0; + + pagefault_disable(); + ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry_ptr, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + + valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG; + bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry); + + /* + * When the kernel is managing bounds tables, a bounds directory + * entry will either have a valid address (plus the valid bit) + * *OR* be completely empty. If we see a !valid entry *and* some + * data in the address field, we know something is wrong. This + * -EINVAL return will cause a SIGSEGV. + */ + if (!valid_bit && bt_addr) + return -EINVAL; + /* + * Do we have an completely zeroed bt entry? That is OK. It + * just means there was no bounds table for this memory. Make + * sure to distinguish this from -EINVAL, which will cause + * a SEGV. + */ + if (!valid_bit) + return -ENOENT; + + *bt_addr_result = bt_addr; + return 0; +} + +static inline int bt_entry_size_bytes(struct mm_struct *mm) +{ + if (is_64bit_mm(mm)) + return MPX_BT_ENTRY_BYTES_64; + else + return MPX_BT_ENTRY_BYTES_32; +} + +/* + * Take a virtual address and turns it in to the offset in bytes + * inside of the bounds table where the bounds table entry + * controlling 'addr' can be found. + */ +static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, + unsigned long addr) +{ + unsigned long bt_table_nr_entries; + unsigned long offset = addr; + + if (is_64bit_mm(mm)) { + /* Bottom 3 bits are ignored on 64-bit */ + offset >>= 3; + bt_table_nr_entries = MPX_BT_NR_ENTRIES_64; + } else { + /* Bottom 2 bits are ignored on 32-bit */ + offset >>= 2; + bt_table_nr_entries = MPX_BT_NR_ENTRIES_32; + } + /* + * We know the size of the table in to which we are + * indexing, and we have eliminated all the low bits + * which are ignored for indexing. + * + * Mask out all the high bits which we do not need + * to index in to the table. Note that the tables + * are always powers of two so this gives us a proper + * mask. + */ + offset &= (bt_table_nr_entries-1); + /* + * We now have an entry offset in terms of *entries* in + * the table. We need to scale it back up to bytes. + */ + offset *= bt_entry_size_bytes(mm); + return offset; +} + +/* + * How much virtual address space does a single bounds + * directory entry cover? + * + * Note, we need a long long because 4GB doesn't fit in + * to a long on 32-bit. + */ +static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) +{ + unsigned long long virt_space; + unsigned long long GB = (1ULL << 30); + + /* + * This covers 32-bit emulation as well as 32-bit kernels + * running on 64-bit hardware. + */ + if (!is_64bit_mm(mm)) + return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; + + /* + * 'x86_virt_bits' returns what the hardware is capable + * of, and returns the full >32-bit address space when + * running 32-bit kernels on 64-bit hardware. + */ + virt_space = (1ULL << boot_cpu_data.x86_virt_bits); + return virt_space / MPX_BD_NR_ENTRIES_64; +} + +/* + * Free the backing physical pages of bounds table 'bt_addr'. + * Assume start...end is within that bounds table. + */ +static noinline int zap_bt_entries_mapping(struct mm_struct *mm, + unsigned long bt_addr, + unsigned long start_mapping, unsigned long end_mapping) +{ + struct vm_area_struct *vma; + unsigned long addr, len; + unsigned long start; + unsigned long end; + + /* + * if we 'end' on a boundary, the offset will be 0 which + * is not what we want. Back it up a byte to get the + * last bt entry. Then once we have the entry itself, + * move 'end' back up by the table entry size. + */ + start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping); + end = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1); + /* + * Move end back up by one entry. Among other things + * this ensures that it remains page-aligned and does + * not screw up zap_page_range() + */ + end += bt_entry_size_bytes(mm); + + /* + * Find the first overlapping vma. If vma->vm_start > start, there + * will be a hole in the bounds table. This -EINVAL return will + * cause a SIGSEGV. + */ + vma = find_vma(mm, start); + if (!vma || vma->vm_start > start) + return -EINVAL; + + /* + * A NUMA policy on a VM_MPX VMA could cause this bounds table to + * be split. So we need to look across the entire 'start -> end' + * range of this bounds table, find all of the VM_MPX VMAs, and + * zap only those. + */ + addr = start; + while (vma && vma->vm_start < end) { + /* + * We followed a bounds directory entry down + * here. If we find a non-MPX VMA, that's bad, + * so stop immediately and return an error. This + * probably results in a SIGSEGV. + */ + if (!(vma->vm_flags & VM_MPX)) + return -EINVAL; + + len = min(vma->vm_end, end) - addr; + zap_page_range(vma, addr, len); + trace_mpx_unmap_zap(addr, addr+len); + + vma = vma->vm_next; + addr = vma->vm_start; + } + return 0; +} + +static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm, + unsigned long addr) +{ + /* + * There are several ways to derive the bd offsets. We + * use the following approach here: + * 1. We know the size of the virtual address space + * 2. We know the number of entries in a bounds table + * 3. We know that each entry covers a fixed amount of + * virtual address space. + * So, we can just divide the virtual address by the + * virtual space used by one entry to determine which + * entry "controls" the given virtual address. + */ + if (is_64bit_mm(mm)) { + int bd_entry_size = 8; /* 64-bit pointer */ + /* + * Take the 64-bit addressing hole in to account. + */ + addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1); + return (addr / bd_entry_virt_space(mm)) * bd_entry_size; + } else { + int bd_entry_size = 4; /* 32-bit pointer */ + /* + * 32-bit has no hole so this case needs no mask + */ + return (addr / bd_entry_virt_space(mm)) * bd_entry_size; + } + /* + * The two return calls above are exact copies. If we + * pull out a single copy and put it in here, gcc won't + * realize that we're doing a power-of-2 divide and use + * shifts. It uses a real divide. If we put them up + * there, it manages to figure it out (gcc 4.8.3). + */ +} + +static int unmap_entire_bt(struct mm_struct *mm, + long __user *bd_entry, unsigned long bt_addr) +{ + unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + unsigned long uninitialized_var(actual_old_val); + int ret; + + while (1) { + int need_write = 1; + unsigned long cleared_bd_entry = 0; + + pagefault_disable(); + ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, + bd_entry, expected_old_val, cleared_bd_entry); + pagefault_enable(); + if (!ret) + break; + if (ret == -EFAULT) + ret = mpx_resolve_fault(bd_entry, need_write); + /* + * If we could not resolve the fault, consider it + * userspace's fault and error out. + */ + if (ret) + return ret; + } + /* + * The cmpxchg was performed, check the results. + */ + if (actual_old_val != expected_old_val) { + /* + * Someone else raced with us to unmap the table. + * That is OK, since we were both trying to do + * the same thing. Declare success. + */ + if (!actual_old_val) + return 0; + /* + * Something messed with the bounds directory + * entry. We hold mmap_sem for read or write + * here, so it could not be a _new_ bounds table + * that someone just allocated. Something is + * wrong, so pass up the error and SIGSEGV. + */ + return -EINVAL; + } + /* + * Note, we are likely being called under do_munmap() already. To + * avoid recursion, do_munmap() will check whether it comes + * from one bounds table through VM_MPX flag. + */ + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL); +} + +static int try_unmap_single_bt(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct vm_area_struct *next; + struct vm_area_struct *prev; + /* + * "bta" == Bounds Table Area: the area controlled by the + * bounds table that we are unmapping. + */ + unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1); + unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm); + unsigned long uninitialized_var(bt_addr); + void __user *bde_vaddr; + int ret; + /* + * We already unlinked the VMAs from the mm's rbtree so 'start' + * is guaranteed to be in a hole. This gets us the first VMA + * before the hole in to 'prev' and the next VMA after the hole + * in to 'next'. + */ + next = find_vma_prev(mm, start, &prev); + /* + * Do not count other MPX bounds table VMAs as neighbors. + * Although theoretically possible, we do not allow bounds + * tables for bounds tables so our heads do not explode. + * If we count them as neighbors here, we may end up with + * lots of tables even though we have no actual table + * entries in use. + */ + while (next && (next->vm_flags & VM_MPX)) + next = next->vm_next; + while (prev && (prev->vm_flags & VM_MPX)) + prev = prev->vm_prev; + /* + * We know 'start' and 'end' lie within an area controlled + * by a single bounds table. See if there are any other + * VMAs controlled by that bounds table. If there are not + * then we can "expand" the are we are unmapping to possibly + * cover the entire table. + */ + next = find_vma_prev(mm, start, &prev); + if ((!prev || prev->vm_end <= bta_start_vaddr) && + (!next || next->vm_start >= bta_end_vaddr)) { + /* + * No neighbor VMAs controlled by same bounds + * table. Try to unmap the whole thing + */ + start = bta_start_vaddr; + end = bta_end_vaddr; + } + + bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start); + ret = get_bt_addr(mm, bde_vaddr, &bt_addr); + /* + * No bounds table there, so nothing to unmap. + */ + if (ret == -ENOENT) { + ret = 0; + return 0; + } + if (ret) + return ret; + /* + * We are unmapping an entire table. Either because the + * unmap that started this whole process was large enough + * to cover an entire table, or that the unmap was small + * but was the area covered by a bounds table. + */ + if ((start == bta_start_vaddr) && + (end == bta_end_vaddr)) + return unmap_entire_bt(mm, bde_vaddr, bt_addr); + return zap_bt_entries_mapping(mm, bt_addr, start, end); +} + +static int mpx_unmap_tables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + unsigned long one_unmap_start; + trace_mpx_unmap_search(start, end); + + one_unmap_start = start; + while (one_unmap_start < end) { + int ret; + unsigned long next_unmap_start = ALIGN(one_unmap_start+1, + bd_entry_virt_space(mm)); + unsigned long one_unmap_end = end; + /* + * if the end is beyond the current bounds table, + * move it back so we only deal with a single one + * at a time + */ + if (one_unmap_end > next_unmap_start) + one_unmap_end = next_unmap_start; + ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end); + if (ret) + return ret; + + one_unmap_start = next_unmap_start; + } + return 0; +} + +/* + * Free unused bounds tables covered in a virtual address region being + * munmap()ed. Assume end > start. + * + * This function will be called by do_munmap(), and the VMAs covering + * the virtual address region start...end have already been split if + * necessary, and the 'vma' is the first vma in this range (start -> end). + */ +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + int ret; + + /* + * Refuse to do anything unless userspace has asked + * the kernel to help manage the bounds tables, + */ + if (!kernel_managing_mpx_tables(current->mm)) + return; + /* + * This will look across the entire 'start -> end' range, + * and find all of the non-VM_MPX VMAs. + * + * To avoid recursion, if a VM_MPX vma is found in the range + * (start->end), we will not continue follow-up work. This + * recursion represents having bounds tables for bounds tables, + * which should not occur normally. Being strict about it here + * helps ensure that we do not have an exploitable stack overflow. + */ + do { + if (vma->vm_flags & VM_MPX) + return; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + + ret = mpx_unmap_tables(mm, start, end); + if (ret) + force_sig(SIGSEGV, current); +} + +/* MPX cannot handle addresses above 47 bits yet. */ +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + if (!kernel_managing_mpx_tables(current->mm)) + return addr; + if (addr + len <= DEFAULT_MAP_WINDOW) + return addr; + if (flags & MAP_FIXED) + return -ENOMEM; + + /* + * Requested len is larger than the whole area we're allowed to map in. + * Resetting hinting address wouldn't do much good -- fail early. + */ + if (len > DEFAULT_MAP_WINDOW) + return -ENOMEM; + + /* Look for unmap area within DEFAULT_MAP_WINDOW */ + return 0; +} diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c new file mode 100644 index 000000000..fa1508556 --- /dev/null +++ b/arch/x86/mm/numa.c @@ -0,0 +1,901 @@ +/* Common code for 32 and 64-bit NUMA */ +#include <linux/acpi.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/ctype.h> +#include <linux/nodemask.h> +#include <linux/sched.h> +#include <linux/topology.h> + +#include <asm/e820/api.h> +#include <asm/proto.h> +#include <asm/dma.h> +#include <asm/amd_nb.h> + +#include "numa_internal.h" + +int numa_off; +nodemask_t numa_nodes_parsed __initdata; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; + +static __init int numa_setup(char *opt) +{ + if (!opt) + return -EINVAL; + if (!strncmp(opt, "off", 3)) + numa_off = 1; +#ifdef CONFIG_NUMA_EMU + if (!strncmp(opt, "fake=", 5)) + numa_emu_cmdline(opt + 5); +#endif +#ifdef CONFIG_ACPI_NUMA + if (!strncmp(opt, "noacpi", 6)) + acpi_numa = -1; +#endif + return 0; +} +early_param("numa", numa_setup); + +/* + * apicid, cpu, node mappings + */ +s16 __apicid_to_node[MAX_LOCAL_APIC] = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + +int numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +void numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + set_cpu_numa_node(cpu, node); +} + +void numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: cpumask_of_node() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) + setup_nr_node_ids(); + + /* allocate the map */ + for (node = 0; node < nr_node_ids; node++) + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); + + /* cpumask_of_node() will now work */ + pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); +} + +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Allocate NODE_DATA for a node on the local memory */ +static void __init alloc_node_data(int nid) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + u64 nd_pa; + void *nd; + int tnid; + + /* + * Allocate node data. Try node-local memory and then any node. + * Never allocate in DMA zone. + */ + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, + MEMBLOCK_ALLOC_ACCESSIBLE); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", + nd_size, nid); + return; + } + } + nd = __va(nd_pa); + + /* report and initialize */ + printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + + node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unnecessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = 0; + const u64 high = PFN_PHYS(max_pfn); + int i, j, k; + + /* first, trim all entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); + + /* and there's no empty or non-exist block */ + if (bi->start >= bi->end || + !memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) + numa_remove_memblk_from(i--, mi); + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + u64 start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->nid, bj->start, bj->end - 1); + return -EINVAL; + } + pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, bj->start, + bj->end - 1, start, end - 1); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_free(__pa(numa_distance), size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + size, PAGE_SIZE); + if (!phys) { + pr_warn("Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_reserve(phys, size); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + u64 numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((s64)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - absent_pages_in_range(0, max_pfn); + + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ +static void __init numa_clear_kernel_node_hotplug(void) +{ + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; + + /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * + * At this time, all memory regions reserved by memblock are + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; + + ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); + } + + /* + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. + * + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] + */ + for_each_memblock(reserved, mb_region) { + if (mb_region->nid != MAX_NUMNODES) + node_set(mb_region->nid, reserved_nodemask); + } + + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. + * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = numa_meminfo.blk + i; + + if (!node_isset(mb->nid, reserved_nodemask)) + continue; + + memblock_clear_hotplug(mb->start, mb->end - mb->start); + } +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + unsigned long uninitialized_var(pfn_align); + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb = &mi->blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.memory, mb->nid); + } + + /* + * At very early time, the kernel have to use some memory such as + * loading the kernel image. We cannot prevent this anyway. So any + * node the kernel resides in should be un-hotpluggable. + * + * And when we come here, alloc node data won't fail. + */ + numa_clear_kernel_node_hotplug(); + + /* + * If sections array is gonna be used for pfn -> nid mapping, check + * whether its granularity is fine enough. + */ +#ifdef NODE_NOT_IN_PAGE_FLAGS + pfn_align = node_map_pfn_alignment(); + if (pfn_align && pfn_align < PAGES_PER_SECTION) { + printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", + PFN_PHYS(pfn_align) >> 20, + PFN_PHYS(PAGES_PER_SECTION) >> 20); + return -EINVAL; + } +#endif + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = PFN_PHYS(max_pfn); + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start >= end) + continue; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + continue; + + alloc_node_data(nid); + } + + /* Dump memblock with node info and return. */ + memblock_dump_all(); + return 0; +} + +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ +static void __init numa_init_array(void) +{ + int rr, i; + + rr = first_node(node_online_map); + for (i = 0; i < nr_cpu_ids; i++) { + if (early_cpu_to_node(i) != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node_in(rr, node_online_map); + } +} + +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory, + MAX_NUMNODES)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved, + MAX_NUMNODES)); + /* In case that parsing SRAT failed. */ + WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX)); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + + /* + * We reset memblock back to the top-down direction + * here because if we configured ACPI_NUMA, we have + * parsed SRAT in init_func(). It is ok to have the + * reset here even if we did't configure ACPI_NUMA + * or acpi numa init fails and fallbacks to dummy + * numa init. + */ + memblock_set_bottom_up(false); + + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", + 0LLU, PFN_PHYS(max_pfn) - 1); + + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); + + return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. + */ +void __init x86_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA + if (!numa_init(x86_acpi_numa_init)) + return; +#endif +#ifdef CONFIG_AMD_NUMA + if (!numa_init(amd_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +} + +static void __init init_memory_less_node(int nid) +{ + unsigned long zones_size[MAX_NR_ZONES] = {0}; + unsigned long zholes_size[MAX_NR_ZONES] = {0}; + + /* Allocate and initialize node data. Memory-less node is now online.*/ + alloc_node_data(nid); + free_area_init_node(nid, zones_size, 0, zholes_size); + + /* + * All zonelists will be built later in start_kernel() after per cpu + * areas are initialized. + */ +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. + */ +void __init init_cpu_to_node(void) +{ + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { + int node = numa_cpu_node(cpu); + + if (node == NUMA_NO_NODE) + continue; + + if (!node_online(node)) + init_memory_less_node(node); + + numa_set_node(cpu, node); + } +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +# ifndef CONFIG_NUMA_EMU +void numa_add_cpu(int cpu) +{ + cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void numa_remove_cpu(int cpu) +{ + cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} +# endif /* !CONFIG_NUMA_EMU */ + +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +int __cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(__cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!cpu_possible(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + +void debug_cpumask_set_cpu(int cpu, int node, bool enable) +{ + struct cpumask *mask; + + if (node == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + mask = node_to_cpumask_map[node]; + if (!mask) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return; + } + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + + printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, cpumask_pr_args(mask)); + return; +} + +# ifndef CONFIG_NUMA_EMU +static void numa_set_cpumask(int cpu, bool enable) +{ + debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); +} + +void numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, false); +} +# endif /* !CONFIG_NUMA_EMU */ + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const struct cpumask *cpumask_of_node(int node) +{ + if (node >= nr_node_ids) { + printk(KERN_WARNING + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_none_mask; + } + if (node_to_cpumask_map[node] == NULL) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return cpu_online_mask; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c new file mode 100644 index 000000000..e8a4a09e2 --- /dev/null +++ b/arch/x86/mm/numa_32.c @@ -0,0 +1,94 @@ +/* + * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation + * August 2002: added remote node KVA remap - Martin J. Bligh + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/init.h> + +#include "numa_internal.h" + +#ifdef CONFIG_DISCONTIGMEM +/* + * 4) physnode_map - the mapping between a pfn and owning node + * physnode_map keeps track of the physical memory layout of a generic + * numa node on a 64Mb break (each element of the array will + * represent 64Mb of memory and will be marked by the node id. so, + * if the first gig is on node 0, and the second gig is on node 1 + * physnode_map will contain: + * + * physnode_map[0-15] = 0; + * physnode_map[16-31] = 1; + * physnode_map[32- ] = -1; + */ +s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1}; +EXPORT_SYMBOL(physnode_map); + +void memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + + printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", + nid, start, end); + printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); + printk(KERN_DEBUG " "); + start = round_down(start, PAGES_PER_SECTION); + end = round_up(end, PAGES_PER_SECTION); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { + physnode_map[pfn / PAGES_PER_SECTION] = nid; + printk(KERN_CONT "%lx ", pfn); + } + printk(KERN_CONT "\n"); +} +#endif + +extern unsigned long highend_pfn, highstart_pfn; + +void __init initmem_init(void) +{ + x86_numa_init(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", + max_low_pfn, highstart_pfn); + + printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", + (ulong) pfn_to_kaddr(highstart_pfn)); + + __vmalloc_start_set = true; + setup_bootmem_allocator(); +} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c new file mode 100644 index 000000000..066f3511d --- /dev/null +++ b/arch/x86/mm/numa_64.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic VM initialization for x86-64 NUMA setups. + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include <linux/bootmem.h> + +#include "numa_internal.h" + +void __init initmem_init(void) +{ + x86_numa_init(); +} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 000000000..4686757a7 --- /dev/null +++ b/arch/x86/mm/numa_emulation.c @@ -0,0 +1,587 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NUMA emulation + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/topology.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <asm/dma.h> + +#include "numa_internal.h" + +static int emu_nid_to_phys[MAX_NUMNODES]; +static char *emu_cmdline __initdata; + +void __init numa_emu_cmdline(char *str) +{ + emu_cmdline = str; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + +static u64 __init mem_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = PFN_DOWN(end); + + if (start_pfn < end_pfn) + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); + return 0; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. + */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start = pb->start; + eb->end = pb->start + size; + eb->nid = nid; + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = pb->nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask = numa_nodes_parsed; + u64 size; + int big; + int nid = 0; + int i, ret; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - mem_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; + + if (nid < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > limit) { + end = limit; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - mem_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) +{ + unsigned long max_pfn = PHYS_PFN(max_addr); + unsigned long base_pfn = PHYS_PFN(base); + unsigned long hole_pfns = PHYS_PFN(hole); + + return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. + */ +static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size, + int nr_nodes, struct numa_memblk *pblk, + int nid) +{ + nodemask_t physnode_mask = numa_nodes_parsed; + int i, ret, uniform = 0; + u64 min_size; + + if ((!size && !nr_nodes) || (nr_nodes && !pblk)) + return -1; + + /* + * In the 'uniform' case split the passed in physical node by + * nr_nodes, in the non-uniform case, ignore the passed in + * physical block and try to create nodes of at least size + * @size. + * + * In the uniform case, split the nodes strictly by physical + * capacity, i.e. ignore holes. In the non-uniform case account + * for holes and treat @size as a minimum floor. + */ + if (!nr_nodes) + nr_nodes = MAX_NUMNODES; + else { + nodes_clear(physnode_mask); + node_set(pblk->nid, physnode_mask); + uniform = 1; + } + + if (uniform) { + min_size = uniform_size(max_addr, addr, 0, nr_nodes); + size = min_size; + } else { + /* + * The limit on emulated nodes is MAX_NUMNODES, so the + * size per node is increased accordingly if the + * requested size is too small. This creates a uniform + * distribution of node sizes across the entire machine + * (but not necessarily over physical nodes). + */ + min_size = uniform_size(max_addr, addr, + mem_hole_size(addr, max_addr), nr_nodes); + } + min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + if (uniform) + end = start + size; + else + end = find_end_of_node(start, limit, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if ((limit - end - mem_hole_size(end, limit) < size) + && !uniform) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return nid; +} + +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, + 0, NULL, 0); +} + +int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. + */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr = PFN_PHYS(max_pfn); + u8 *phys_dist = NULL; + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int max_emu_nid, dfl_phys_nid; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi = *numa_meminfo; + + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'U')) { + nodemask_t physnode_mask = numa_nodes_parsed; + unsigned long n; + int nid = 0; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = -1; + for_each_node_mask(i, physnode_mask) { + /* + * The reason we pass in blk[0] is due to + * numa_remove_memblk_from() called by + * emu_setup_memblk() will delete entry 0 + * and then move everything else up in the pi.blk + * array. Therefore we should always be looking + * at blk[0]. + */ + ret = split_nodes_size_interleave_uniform(&ei, &pi, + pi.blk[0].start, pi.blk[0].end, 0, + n, &pi.blk[0], nid); + if (ret < 0) + break; + if (ret < n) { + pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", + __func__, i, ret, n); + ret = -1; + break; + } + nid = ret; + } + } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size = memparse(emu_cmdline, &emu_cmdline); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + if (*emu_cmdline == ':') + emu_cmdline++; + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + goto no_emu; + } + + /* copy the physical distance table */ + if (numa_dist_cnt) { + u64 phys; + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + phys_size, PAGE_SIZE); + if (!phys) { + pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + goto no_emu; + } + memblock_reserve(phys, phys_size); + phys_dist = __va(phys); + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); + } + + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. + */ + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); + + /* commit */ + *numa_meminfo = ei; + + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + + /* + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. + */ + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; + } + + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = dfl_phys_nid; + + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < max_emu_nid + 1; i++) { + for (j = 0; j < max_emu_nid + 1; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (get_option(&emu_cmdline, &dist) == 2) + ; + else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + + /* free the copied physical distance table */ + if (phys_dist) + memblock_free(__pa(phys_dist), phys_size); + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void numa_add_cpu(int cpu) +{ + int physnid, nid; + + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + physnid = emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. + */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void numa_remove_cpu(int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void numa_set_cpumask(int cpu, bool enable) +{ + int nid, physnid; + + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(nid) { + if (emu_nid_to_phys[nid] != physnid) + continue; + + debug_cpumask_set_cpu(cpu, nid, enable); + } +} + +void numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, false); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 000000000..86860f279 --- /dev/null +++ b/arch/x86/mm/numa_internal.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_MM_NUMA_INTERNAL_H +#define __X86_MM_NUMA_INTERNAL_H + +#include <linux/types.h> +#include <asm/numa.h> + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void __init numa_reset_distance(void); + +void __init x86_numa_init(void); + +#ifdef CONFIG_NUMA_EMU +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +#endif + +#endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c new file mode 100644 index 000000000..a25588ad7 --- /dev/null +++ b/arch/x86/mm/pageattr-test.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * self test for change_page_attr. + * + * Clears the a test pte bit on random pages in the direct mapping, + * then reverts and compares page tables forwards and afterwards. + */ +#include <linux/bootmem.h> +#include <linux/kthread.h> +#include <linux/random.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include <asm/cacheflush.h> +#include <asm/pgtable.h> +#include <asm/kdebug.h> + +/* + * Only print the results of the first pass: + */ +static __read_mostly int print = 1; + +enum { + NTEST = 400, +#ifdef CONFIG_X86_64 + LPS = (1 << PMD_SHIFT), +#elif defined(CONFIG_X86_PAE) + LPS = (1 << PMD_SHIFT), +#else + LPS = (1 << 22), +#endif + GPS = (1<<30) +}; + +#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST) + +static int pte_testbit(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SOFTW1; +} + +struct split_state { + long lpg, gpg, spg, exec; + long min_exec, max_exec; +}; + +static int print_split(struct split_state *s) +{ + long i, expected, missed = 0; + int err = 0; + + s->lpg = s->gpg = s->spg = s->exec = 0; + s->min_exec = ~0UL; + s->max_exec = 0; + for (i = 0; i < max_pfn_mapped; ) { + unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT); + unsigned int level; + pte_t *pte; + + pte = lookup_address(addr, &level); + if (!pte) { + missed++; + i++; + continue; + } + + if (level == PG_LEVEL_1G && sizeof(long) == 8) { + s->gpg++; + i += GPS/PAGE_SIZE; + } else if (level == PG_LEVEL_2M) { + if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) { + printk(KERN_ERR + "%lx level %d but not PSE %Lx\n", + addr, level, (u64)pte_val(*pte)); + err = 1; + } + s->lpg++; + i += LPS/PAGE_SIZE; + } else { + s->spg++; + i++; + } + if (!(pte_val(*pte) & _PAGE_NX)) { + s->exec++; + if (addr < s->min_exec) + s->min_exec = addr; + if (addr > s->max_exec) + s->max_exec = addr; + } + } + if (print) { + printk(KERN_INFO + " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, + s->max_exec, missed); + } + + expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; + if (expected != i) { + printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n", + max_pfn_mapped, expected); + return 1; + } + return err; +} + +static unsigned long addr[NTEST]; +static unsigned int len[NTEST]; + +/* Change the global bit on random pages in the direct mapping */ +static int pageattr_test(void) +{ + struct split_state sa, sb, sc; + unsigned long *bm; + pte_t *pte, pte0; + int failed = 0; + unsigned int level; + int i, k; + int err; + unsigned long test_addr; + + if (print) + printk(KERN_INFO "CPA self-test:\n"); + + bm = vzalloc((max_pfn_mapped + 7) / 8); + if (!bm) { + printk(KERN_ERR "CPA Cannot vmalloc bitmap\n"); + return -ENOMEM; + } + + failed += print_split(&sa); + + for (i = 0; i < NTEST; i++) { + unsigned long pfn = prandom_u32() % max_pfn_mapped; + + addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); + len[i] = prandom_u32() % 100; + len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); + + if (len[i] == 0) + len[i] = 1; + + pte = NULL; + pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */ + + for (k = 0; k < len[i]; k++) { + pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 || + !(pte_val(*pte) & _PAGE_PRESENT)) { + addr[i] = 0; + break; + } + if (k == 0) { + pte0 = *pte; + } else { + if (pgprot_val(pte_pgprot(*pte)) != + pgprot_val(pte_pgprot(pte0))) { + len[i] = k; + break; + } + } + if (test_bit(pfn + k, bm)) { + len[i] = k; + break; + } + __set_bit(pfn + k, bm); + } + if (!addr[i] || !pte || !k) { + addr[i] = 0; + continue; + } + + test_addr = addr[i]; + err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); + if (err < 0) { + printk(KERN_ERR "CPA %d failed %d\n", i, err); + failed++; + } + + pte = lookup_address(addr[i], &level); + if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], + pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + if (level != PG_LEVEL_4K) { + printk(KERN_ERR "CPA %lx: unexpected level %d\n", + addr[i], level); + failed++; + } + + } + vfree(bm); + + failed += print_split(&sb); + + for (i = 0; i < NTEST; i++) { + if (!addr[i]) + continue; + pte = lookup_address(addr[i], &level); + if (!pte) { + printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]); + failed++; + continue; + } + test_addr = addr[i]; + err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); + if (err < 0) { + printk(KERN_ERR "CPA reverting failed: %d\n", err); + failed++; + } + pte = lookup_address(addr[i], &level); + if (!pte || pte_testbit(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", + addr[i], pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + + } + + failed += print_split(&sc); + + if (failed) { + WARN(1, KERN_ERR "NOT PASSED. Please report.\n"); + return -EINVAL; + } else { + if (print) + printk(KERN_INFO "ok.\n"); + } + + return 0; +} + +static int do_pageattr_test(void *__unused) +{ + while (!kthread_should_stop()) { + schedule_timeout_interruptible(HZ*30); + if (pageattr_test() < 0) + break; + if (print) + print--; + } + return 0; +} + +static int start_pageattr_test(void) +{ + struct task_struct *p; + + p = kthread_create(do_pageattr_test, NULL, "pageattr-test"); + if (!IS_ERR(p)) + wake_up_process(p); + else + WARN_ON(1); + + return 0; +} +device_initcall(start_pageattr_test); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c new file mode 100644 index 000000000..101f3ad0d --- /dev/null +++ b/arch/x86/mm/pageattr.c @@ -0,0 +1,2154 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/pfn.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/pci.h> +#include <linux/vmalloc.h> + +#include <asm/e820/api.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <linux/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/proto.h> +#include <asm/pat.h> +#include <asm/set_memory.h> + +/* + * The current flushing context - we pass it instead of 5 arguments: + */ +struct cpa_data { + unsigned long *vaddr; + pgd_t *pgd; + pgprot_t mask_set; + pgprot_t mask_clr; + unsigned long numpages; + int flags; + unsigned long pfn; + unsigned force_split : 1; + int curpage; + struct page **pages; +}; + +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock. So that we don't allow any other cpu, with stale large tlb + * entries change the page attribute in parallel to some other cpu + * splitting a large page entry along with changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 +#define CPA_PAGES_ARRAY 4 +#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ + +#ifdef CONFIG_PROC_FS +static unsigned long direct_pages_count[PG_LEVEL_NUM]; + +void update_page_count(int level, unsigned long pages) +{ + /* Protect against CPA */ + spin_lock(&pgd_lock); + direct_pages_count[level] += pages; + spin_unlock(&pgd_lock); +} + +static void split_page_count(int level) +{ + if (direct_pages_count[level] == 0) + return; + + direct_pages_count[level]--; + direct_pages_count[level - 1] += PTRS_PER_PTE; +} + +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + direct_pages_count[PG_LEVEL_4K] << 2); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + seq_printf(m, "DirectMap2M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 11); +#else + seq_printf(m, "DirectMap4M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 12); +#endif + if (direct_gbpages) + seq_printf(m, "DirectMap1G: %8lu kB\n", + direct_pages_count[PG_LEVEL_1G] << 20); +} +#else +static inline void split_page_count(int level) { } +#endif + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +static inline int +within_inclusive(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr <= end; +} + +#ifdef CONFIG_X86_64 + +static inline unsigned long highmap_start_pfn(void) +{ + return __pa_symbol(_text) >> PAGE_SHIFT; +} + +static inline unsigned long highmap_end_pfn(void) +{ + /* Do not reference physical address outside the kernel. */ + return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; +} + +static bool __cpa_pfn_in_highmap(unsigned long pfn) +{ + /* + * Kernel text has an alias mapping at a high address, known + * here as "highmap". + */ + return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); +} + +#else + +static bool __cpa_pfn_in_highmap(unsigned long pfn) +{ + /* There is no highmap on 32-bit */ + return false; +} + +#endif + +/* + * Flushing functions + */ + +/** + * clflush_cache_range - flush a cache range with clflush + * @vaddr: virtual start address + * @size: number of bytes to flush + * + * clflushopt is an unordered instruction which needs fencing with mfence or + * sfence to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; + void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); + void *vend = vaddr + size; + + if (p >= vend) + return; + + mb(); + + for (; p < vend; p += clflush_size) + clflushopt(p); + + mb(); +} +EXPORT_SYMBOL_GPL(clflush_cache_range); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + clflush_cache_range(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); + +static void __cpa_flush_all(void *arg) +{ + unsigned long cache = (unsigned long)arg; + + /* + * Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); + + if (cache && boot_cpu_data.x86 >= 4) + wbinvd(); +} + +static void cpa_flush_all(unsigned long cache) +{ + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + + on_each_cpu(__cpa_flush_all, (void *) cache, 1); +} + +static void __cpa_flush_range(void *arg) +{ + /* + * We could optimize that further and do individual per page + * tlb invalidates for a low number of pages. Caveat: we must + * flush the high aliases on 64bit as well. + */ + __flush_tlb_all(); +} + +static void cpa_flush_range(unsigned long start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long addr; + + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + WARN_ON(PAGE_ALIGN(start) != start); + + on_each_cpu(__cpa_flush_range, NULL, 1); + + if (!cache) + return; + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { + pte_t *pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) addr, PAGE_SIZE); + } +} + +static void cpa_flush_array(unsigned long *start, int numpages, int cache, + int in_flags, struct page **pages) +{ + unsigned int i, level; +#ifdef CONFIG_PREEMPT + /* + * Avoid wbinvd() because it causes latencies on all CPUs, + * regardless of any CPU isolation that may be in effect. + * + * This should be extended for CAT enabled systems independent of + * PREEMPT because wbinvd() does not respect the CAT partitions and + * this is exposed to unpriviledged users through the graphics + * subsystem. + */ + unsigned long do_wbinvd = 0; +#else + unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ +#endif + + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + + on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); + + if (!cache || do_wbinvd) + return; + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0; i < numpages; i++) { + unsigned long addr; + pte_t *pte; + + if (in_flags & CPA_PAGES_ARRAY) + addr = (unsigned long)page_address(pages[i]); + else + addr = start[i]; + + pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *)addr, PAGE_SIZE); + } +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, + unsigned long pfn) +{ + pgprot_t forbidden = __pgprot(0); + + /* + * The BIOS area between 640k and 1Mb needs to be executable for + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. + */ +#ifdef CONFIG_PCI_BIOS + if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) + pgprot_val(forbidden) |= _PAGE_NX; +#endif + + /* + * The kernel text needs to be executable for obvious reasons + * Does not cover __inittext since that is gone later on. On + * 64bit we do not enforce !NX on the low mapping + */ + if (within(address, (unsigned long)_text, (unsigned long)_etext)) + pgprot_val(forbidden) |= _PAGE_NX; + + /* + * The .rodata section needs to be read-only. Using the pfn + * catches all aliases. This also includes __ro_after_init, + * so do not enforce until kernel_set_to_readonly is true. + */ + if (kernel_set_to_readonly && + within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, + __pa_symbol(__end_rodata) >> PAGE_SHIFT)) + pgprot_val(forbidden) |= _PAGE_RW; + +#if defined(CONFIG_X86_64) + /* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything that user asks. + * + * This will preserve the large page mappings for kernel text/data + * at no extra cost. + */ + if (kernel_set_to_readonly && + within(address, (unsigned long)_text, + (unsigned long)__end_rodata_hpage_align)) { + unsigned int level; + + /* + * Don't enforce the !RW mapping for the kernel text mapping, + * if the current mapping is already using small page mapping. + * No need to work hard to preserve large page mappings in this + * case. + * + * This also fixes the Linux Xen paravirt guest boot failure + * (because of unexpected read-only mappings for kernel identity + * mappings). In this paravirt guest case, the kernel text + * mapping and the kernel identity mapping share the same + * page-table pages. Thus we can't really use different + * protections for the kernel text and identity mappings. Also, + * these shared mappings are made of small page mappings. + * Thus this don't enforce !RW mapping for small page kernel + * text mapping logic will help Linux Xen parvirt guest boot + * as well. + */ + if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) + pgprot_val(forbidden) |= _PAGE_RW; + } +#endif + + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + + return prot; +} + +/* + * Lookup the page table entry for a virtual address in a specific pgd. + * Return a pointer to the entry and the level of the mapping. + */ +pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, + unsigned int *level) +{ + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + *level = PG_LEVEL_NONE; + + if (pgd_none(*pgd)) + return NULL; + + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return NULL; + + *level = PG_LEVEL_512G; + if (p4d_large(*p4d) || !p4d_present(*p4d)) + return (pte_t *)p4d; + + pud = pud_offset(p4d, address); + if (pud_none(*pud)) + return NULL; + + *level = PG_LEVEL_1G; + if (pud_large(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + + *level = PG_LEVEL_2M; + if (pmd_large(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + *level = PG_LEVEL_4K; + + return pte_offset_kernel(pmd, address); +} + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + return lookup_address_in_pgd(pgd_offset_k(address), address, level); +} +EXPORT_SYMBOL_GPL(lookup_address); + +static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, + unsigned int *level) +{ + if (cpa->pgd) + return lookup_address_in_pgd(cpa->pgd + pgd_index(address), + address, level); + + return lookup_address(address, level); +} + +/* + * Lookup the PMD entry for a virtual address. Return a pointer to the entry + * or NULL if not present. + */ +pmd_t *lookup_pmd_address(unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset_k(address); + if (pgd_none(*pgd)) + return NULL; + + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, address); + if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) + return NULL; + + return pmd_offset(pud, address); +} + +/* + * This is necessary because __pa() does not work on some + * kinds of memory, like vmalloc() or the alloc_remap() + * areas on 32-bit NUMA systems. The percpu areas can + * end up in this kind of memory, for instance. + * + * This could be optimized, but it is only intended to be + * used at inititalization time, and keeping it + * unoptimized should increase the testing coverage for + * the more obscure platforms. + */ +phys_addr_t slow_virt_to_phys(void *__virt_addr) +{ + unsigned long virt_addr = (unsigned long)__virt_addr; + phys_addr_t phys_addr; + unsigned long offset; + enum pg_level level; + pte_t *pte; + + pte = lookup_address(virt_addr, &level); + BUG_ON(!pte); + + /* + * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t + * before being left-shifted PAGE_SHIFT bits -- this trick is to + * make 32-PAE kernel work correctly. + */ + switch (level) { + case PG_LEVEL_1G: + phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PUD_PAGE_MASK; + break; + case PG_LEVEL_2M: + phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; + offset = virt_addr & ~PMD_PAGE_MASK; + break; + default: + phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + offset = virt_addr & ~PAGE_MASK; + } + + return (phys_addr_t)(phys_addr | offset); +} +EXPORT_SYMBOL_GPL(slow_virt_to_phys); + +/* + * Set the new pmd in all the pgds we know about: + */ +static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) +{ + /* change init_mm */ + set_pte_atomic(kpte, pte); +#ifdef CONFIG_X86_32 + if (!SHARED_KERNEL_PMD) { + struct page *page; + + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); + set_pte_atomic((pte_t *)pmd, pte); + } + } +#endif +} + +static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) +{ + /* + * _PAGE_GLOBAL means "global page" for present PTEs. + * But, it is also used to indicate _PAGE_PROTNONE + * for non-present PTEs. + * + * This ensures that a _PAGE_GLOBAL PTE going from + * present to non-present is not confused as + * _PAGE_PROTNONE. + */ + if (!(pgprot_val(prot) & _PAGE_PRESENT)) + pgprot_val(prot) &= ~_PAGE_GLOBAL; + + return prot; +} + +static int +try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; + pte_t new_pte, old_pte, *tmp; + pgprot_t old_prot, new_prot, req_prot; + int i, do_split = 1; + enum pg_level level; + + if (cpa->force_split) + return 1; + + spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up already: + */ + tmp = _lookup_address_cpa(cpa, address, &level); + if (tmp != kpte) + goto out_unlock; + + switch (level) { + case PG_LEVEL_2M: + old_prot = pmd_pgprot(*(pmd_t *)kpte); + old_pfn = pmd_pfn(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + old_prot = pud_pgprot(*(pud_t *)kpte); + old_pfn = pud_pfn(*(pud_t *)kpte); + break; + default: + do_split = -EINVAL; + goto out_unlock; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Calculate the number of pages, which fit into this large + * page starting at address: + */ + nextpage_addr = (address + psize) & pmask; + numpages = (nextpage_addr - address) >> PAGE_SHIFT; + if (numpages < cpa->numpages) + cpa->numpages = numpages; + + /* + * We are safe now. Check whether the new pgprot is the same: + * Convert protection attributes to 4k-format, as cpa->mask* are set + * up accordingly. + */ + old_pte = *kpte; + /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ + req_prot = pgprot_large_2_4k(old_prot); + + pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); + + /* + * req_prot is in format of 4k pages. It must be converted to large + * page format: the caching mode includes the PAT bit located at + * different bit positions in the two formats. + */ + req_prot = pgprot_4k_2_large(req_prot); + req_prot = pgprot_clear_protnone_bits(req_prot); + if (pgprot_val(req_prot) & _PAGE_PRESENT) + pgprot_val(req_prot) |= _PAGE_PSE; + + /* + * old_pfn points to the large page base pfn. So we need + * to add the offset of the virtual address: + */ + pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); + cpa->pfn = pfn; + + new_prot = static_protections(req_prot, address, pfn); + + /* + * We need to check the full range, whether + * static_protection() requires a different pgprot for one of + * the pages in the range we try to preserve: + */ + addr = address & pmask; + pfn = old_pfn; + for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { + pgprot_t chk_prot = static_protections(req_prot, addr, pfn); + + if (pgprot_val(chk_prot) != pgprot_val(new_prot)) + goto out_unlock; + } + + /* + * If there are no changes, return. maxpages has been updated + * above: + */ + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { + do_split = 0; + goto out_unlock; + } + + /* + * We need to change the attributes. Check, whether we can + * change the large page in one go. We request a split, when + * the address is not aligned and the number of pages is + * smaller than the number of pages in the large page. Note + * that we limited the number of possible pages already to + * the number of pages in the large page. + */ + if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { + /* + * The address is aligned and the number of pages + * covers the full page. + */ + new_pte = pfn_pte(old_pfn, new_prot); + __set_pmd_pte(kpte, address, new_pte); + cpa->flags |= CPA_FLUSHTLB; + do_split = 0; + } + +out_unlock: + spin_unlock(&pgd_lock); + + return do_split; +} + +static int +__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, + struct page *base) +{ + pte_t *pbase = (pte_t *)page_address(base); + unsigned long ref_pfn, pfn, pfninc = 1; + unsigned int i, level; + pte_t *tmp; + pgprot_t ref_prot; + + spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up for us already: + */ + tmp = _lookup_address_cpa(cpa, address, &level); + if (tmp != kpte) { + spin_unlock(&pgd_lock); + return 1; + } + + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); + + switch (level) { + case PG_LEVEL_2M: + ref_prot = pmd_pgprot(*(pmd_t *)kpte); + /* + * Clear PSE (aka _PAGE_PAT) and move + * PAT bit to correct position. + */ + ref_prot = pgprot_large_2_4k(ref_prot); + + ref_pfn = pmd_pfn(*(pmd_t *)kpte); + break; + + case PG_LEVEL_1G: + ref_prot = pud_pgprot(*(pud_t *)kpte); + ref_pfn = pud_pfn(*(pud_t *)kpte); + pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + + /* + * Clear the PSE flags if the PRESENT flag is not set + * otherwise pmd_present/pmd_huge will return true + * even on a non present pmd. + */ + if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) + pgprot_val(ref_prot) &= ~_PAGE_PSE; + break; + + default: + spin_unlock(&pgd_lock); + return 1; + } + + ref_prot = pgprot_clear_protnone_bits(ref_prot); + + /* + * Get the target pfn from the original entry: + */ + pfn = ref_pfn; + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) + set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + + if (virt_addr_valid(address)) { + unsigned long pfn = PFN_DOWN(__pa(address)); + + if (pfn_range_is_mapped(pfn, pfn + 1)) + split_page_count(level); + } + + /* + * Install the new, split up pagetable. + * + * We use the standard kernel pagetable protections for the new + * pagetable protections, the actual ptes set above control the + * primary protection behavior: + */ + __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); + + /* + * Intel Atom errata AAH41 workaround. + * + * The real fix should be in hw or in a microcode update, but + * we also probabilistically try to reduce the window of having + * a large TLB mixed with 4K TLBs while instruction fetches are + * going on. + */ + __flush_tlb_all(); + spin_unlock(&pgd_lock); + + return 0; +} + +static int split_large_page(struct cpa_data *cpa, pte_t *kpte, + unsigned long address) +{ + struct page *base; + + if (!debug_pagealloc_enabled()) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + if (!debug_pagealloc_enabled()) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + + if (__split_large_page(cpa, kpte, address, base)) + __free_page(base); + + return 0; +} + +static bool try_to_free_pte_page(pte_t *pte) +{ + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) + if (!pte_none(pte[i])) + return false; + + free_page((unsigned long)pte); + return true; +} + +static bool try_to_free_pmd_page(pmd_t *pmd) +{ + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_none(pmd[i])) + return false; + + free_page((unsigned long)pmd); + return true; +} + +static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, start); + + while (start < end) { + set_pte(pte, __pte(0)); + + start += PAGE_SIZE; + pte++; + } + + if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + pmd_clear(pmd); + return true; + } + return false; +} + +static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, + unsigned long start, unsigned long end) +{ + if (unmap_pte_range(pmd, start, end)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} + +static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, start); + + /* + * Not on a 2MB page boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + __unmap_pmd_range(pud, pmd, start, pre_end); + + start = pre_end; + pmd++; + } + + /* + * Try to unmap in 2M chunks. + */ + while (end - start >= PMD_SIZE) { + if (pmd_large(*pmd)) + pmd_clear(pmd); + else + __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + + start += PMD_SIZE; + pmd++; + } + + /* + * 4K leftovers? + */ + if (start < end) + return __unmap_pmd_range(pud, pmd, start, end); + + /* + * Try again to free the PMD page if haven't succeeded above. + */ + if (!pud_none(*pud)) + if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + pud_clear(pud); +} + +static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) +{ + pud_t *pud = pud_offset(p4d, start); + + /* + * Not on a GB page boundary? + */ + if (start & (PUD_SIZE - 1)) { + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + unsigned long pre_end = min_t(unsigned long, end, next_page); + + unmap_pmd_range(pud, start, pre_end); + + start = pre_end; + pud++; + } + + /* + * Try to unmap in 1G chunks? + */ + while (end - start >= PUD_SIZE) { + + if (pud_large(*pud)) + pud_clear(pud); + else + unmap_pmd_range(pud, start, start + PUD_SIZE); + + start += PUD_SIZE; + pud++; + } + + /* + * 2M leftovers? + */ + if (start < end) + unmap_pmd_range(pud, start, end); + + /* + * No need to try to free the PUD page because we'll free it in + * populate_pgd's error path + */ +} + +static int alloc_pte_page(pmd_t *pmd) +{ + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!pte) + return -1; + + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + return 0; +} + +static int alloc_pmd_page(pud_t *pud) +{ + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); + if (!pmd) + return -1; + + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + return 0; +} + +static void populate_pte(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) +{ + pte_t *pte; + + pte = pte_offset_kernel(pmd, start); + + pgprot = pgprot_clear_protnone_bits(pgprot); + + while (num_pages-- && start < end) { + set_pte(pte, pfn_pte(cpa->pfn, pgprot)); + + start += PAGE_SIZE; + cpa->pfn++; + pte++; + } +} + +static long populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) +{ + long cur_pages = 0; + pmd_t *pmd; + pgprot_t pmd_pgprot; + + /* + * Not on a 2M boundary? + */ + if (start & (PMD_SIZE - 1)) { + unsigned long pre_end = start + (num_pages << PAGE_SHIFT); + unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; + + pre_end = min_t(unsigned long, pre_end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(unsigned int, num_pages, cur_pages); + + /* + * Need a PTE page? + */ + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); + + start = pre_end; + } + + /* + * We mapped them all? + */ + if (num_pages == cur_pages) + return cur_pages; + + pmd_pgprot = pgprot_4k_2_large(pgprot); + + while (end - start >= PMD_SIZE) { + + /* + * We cannot use a 1G page so allocate a PMD page if needed. + */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + pmd = pmd_offset(pud, start); + + set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, + canon_pgprot(pmd_pgprot)))); + + start += PMD_SIZE; + cpa->pfn += PMD_SIZE >> PAGE_SHIFT; + cur_pages += PMD_SIZE >> PAGE_SHIFT; + } + + /* + * Map trailing 4K pages. + */ + if (start < end) { + pmd = pmd_offset(pud, start); + if (pmd_none(*pmd)) + if (alloc_pte_page(pmd)) + return -1; + + populate_pte(cpa, start, end, num_pages - cur_pages, + pmd, pgprot); + } + return num_pages; +} + +static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, + pgprot_t pgprot) +{ + pud_t *pud; + unsigned long end; + long cur_pages = 0; + pgprot_t pud_pgprot; + + end = start + (cpa->numpages << PAGE_SHIFT); + + /* + * Not on a Gb page boundary? => map everything up to it with + * smaller pages. + */ + if (start & (PUD_SIZE - 1)) { + unsigned long pre_end; + unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; + + pre_end = min_t(unsigned long, end, next_page); + cur_pages = (pre_end - start) >> PAGE_SHIFT; + cur_pages = min_t(int, (int)cpa->numpages, cur_pages); + + pud = pud_offset(p4d, start); + + /* + * Need a PMD page? + */ + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, + pud, pgprot); + if (cur_pages < 0) + return cur_pages; + + start = pre_end; + } + + /* We mapped them all? */ + if (cpa->numpages == cur_pages) + return cur_pages; + + pud = pud_offset(p4d, start); + pud_pgprot = pgprot_4k_2_large(pgprot); + + /* + * Map everything starting from the Gb boundary, possibly with 1G pages + */ + while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { + set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, + canon_pgprot(pud_pgprot)))); + + start += PUD_SIZE; + cpa->pfn += PUD_SIZE >> PAGE_SHIFT; + cur_pages += PUD_SIZE >> PAGE_SHIFT; + pud++; + } + + /* Map trailing leftover */ + if (start < end) { + long tmp; + + pud = pud_offset(p4d, start); + if (pud_none(*pud)) + if (alloc_pmd_page(pud)) + return -1; + + tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, + pud, pgprot); + if (tmp < 0) + return cur_pages; + + cur_pages += tmp; + } + return cur_pages; +} + +/* + * Restrictions for kernel page table do not necessarily apply when mapping in + * an alternate PGD. + */ +static int populate_pgd(struct cpa_data *cpa, unsigned long addr) +{ + pgprot_t pgprot = __pgprot(_KERNPG_TABLE); + pud_t *pud = NULL; /* shut up gcc */ + p4d_t *p4d; + pgd_t *pgd_entry; + long ret; + + pgd_entry = cpa->pgd + pgd_index(addr); + + if (pgd_none(*pgd_entry)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + + /* + * Allocate a PUD page and hand it down for mapping. + */ + p4d = p4d_offset(pgd_entry, addr); + if (p4d_none(*p4d)) { + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); + if (!pud) + return -1; + + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); + + ret = populate_pud(cpa, addr, p4d, pgprot); + if (ret < 0) { + /* + * Leave the PUD page in place in case some other CPU or thread + * already found it, but remove any useless entries we just + * added to it. + */ + unmap_pud_range(p4d, addr, + addr + (cpa->numpages << PAGE_SHIFT)); + return ret; + } + + cpa->numpages = ret; + return 0; +} + +static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, + int primary) +{ + if (cpa->pgd) { + /* + * Right now, we only execute this code path when mapping + * the EFI virtual memory map regions, no other users + * provide a ->pgd value. This may change in the future. + */ + return populate_pgd(cpa, vaddr); + } + + /* + * Ignore all non primary paths. + */ + if (!primary) { + cpa->numpages = 1; + return 0; + } + + /* + * Ignore the NULL PTE for kernel identity mapping, as it is expected + * to have holes. + * Also set numpages to '1' indicating that we processed cpa req for + * one virtual address page and its pfn. TBD: numpages can be set based + * on the initial value and the level returned by lookup_address(). + */ + if (within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + cpa->numpages = 1; + cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; + return 0; + + } else if (__cpa_pfn_in_highmap(cpa->pfn)) { + /* Faults in the highmap are OK, so do not warn: */ + return -EFAULT; + } else { + WARN(1, KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", vaddr, + *cpa->vaddr); + + return -EFAULT; + } +} + +static int __change_page_attr(struct cpa_data *cpa, int primary) +{ + unsigned long address; + int do_split, err; + unsigned int level; + pte_t *kpte, old_pte; + + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + address = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; +repeat: + kpte = _lookup_address_cpa(cpa, address, &level); + if (!kpte) + return __cpa_process_fault(cpa, address, primary); + + old_pte = *kpte; + if (pte_none(old_pte)) + return __cpa_process_fault(cpa, address, primary); + + if (level == PG_LEVEL_4K) { + pte_t new_pte; + pgprot_t new_prot = pte_pgprot(old_pte); + unsigned long pfn = pte_pfn(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + + new_prot = static_protections(new_prot, address, pfn); + + new_prot = pgprot_clear_protnone_bits(new_prot); + + /* + * We need to keep the pfn from the existing PTE, + * after all we're only going to change it's attributes + * not the memory it points to + */ + new_pte = pfn_pte(pfn, new_prot); + cpa->pfn = pfn; + /* + * Do we really change anything ? + */ + if (pte_val(old_pte) != pte_val(new_pte)) { + set_pte_atomic(kpte, new_pte); + cpa->flags |= CPA_FLUSHTLB; + } + cpa->numpages = 1; + return 0; + } + + /* + * Check, whether we can keep the large page intact + * and just change the pte: + */ + do_split = try_preserve_large_page(kpte, address, cpa); + /* + * When the range fits into the existing large page, + * return. cp->numpages and cpa->tlbflush have been updated in + * try_large_page: + */ + if (do_split <= 0) + return do_split; + + /* + * We have to split the large page: + */ + err = split_large_page(cpa, kpte, address); + if (!err) { + /* + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * With out this, we violate the TLB application note, that says + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. + */ + flush_tlb_all(); + goto repeat; + } + + return err; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); + +static int cpa_process_alias(struct cpa_data *cpa) +{ + struct cpa_data alias_cpa; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr; + int ret; + + if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) + return 0; + + /* + * No need to redo, when the primary call touched the direct + * mapping already: + */ + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + vaddr = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; + + if (!(within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { + + alias_cpa = *cpa; + alias_cpa.vaddr = &laddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + +#ifdef CONFIG_X86_64 + /* + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need + * to touch the high mapped kernel as well: + */ + if (!within(vaddr, (unsigned long)_text, _brk_end) && + __cpa_pfn_in_highmap(cpa->pfn)) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } +#endif + + return 0; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) +{ + unsigned long numpages = cpa->numpages; + int ret; + + while (numpages) { + /* + * Store the remaining nr of pages for the large page + * preservation check. + */ + cpa->numpages = numpages; + /* for array changes, we can't use large page */ + if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa->numpages = 1; + + if (!debug_pagealloc_enabled()) + spin_lock(&cpa_lock); + ret = __change_page_attr(cpa, checkalias); + if (!debug_pagealloc_enabled()) + spin_unlock(&cpa_lock); + if (ret) + return ret; + + if (checkalias) { + ret = cpa_process_alias(cpa); + if (ret) + return ret; + } + + /* + * Adjust the number of pages with the result of the + * CPA operation. Either a large page has been + * preserved or a single page update happened. + */ + BUG_ON(cpa->numpages > numpages || !cpa->numpages); + numpages -= cpa->numpages; + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; + + } + return 0; +} + +/* + * Machine check recovery code needs to change cache mode of poisoned + * pages to UC to avoid speculative access logging another error. But + * passing the address of the 1:1 mapping to set_memory_uc() is a fine + * way to encourage a speculative access. So we cheat and flip the top + * bit of the address. This works fine for the code that updates the + * page tables. But at the end of the process we need to flush the cache + * and the non-canonical address causes a #GP fault when used by the + * CLFLUSH instruction. + * + * But in the common case we already have a canonical address. This code + * will fix the top bit if needed and is a no-op otherwise. + */ +static inline unsigned long make_addr_canonical_again(unsigned long addr) +{ +#ifdef CONFIG_X86_64 + return (long)(addr << 1) >> 1; +#else + return addr; +#endif +} + + +static int change_page_attr_set_clr(unsigned long *addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr, + int force_split, int in_flag, + struct page **pages) +{ + struct cpa_data cpa; + int ret, cache, checkalias; + unsigned long baddr = 0; + + memset(&cpa, 0, sizeof(cpa)); + + /* + * Check, if we are requested to set a not supported + * feature. Clearing non-supported features is OK. + */ + mask_set = canon_pgprot(mask_set); + + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) + return 0; + + /* Ensure we are PAGE_SIZE aligned */ + if (in_flag & CPA_ARRAY) { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } + } else if (!(in_flag & CPA_PAGES_ARRAY)) { + /* + * in_flag of CPA_PAGES_ARRAY implies it is aligned. + * No need to cehck in that case + */ + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + /* + * Save address for cache flush. *addr is modified in the call + * to __change_page_attr_set_clr() below. + */ + baddr = make_addr_canonical_again(*addr); + } + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + + vm_unmap_aliases(); + + cpa.vaddr = addr; + cpa.pages = pages; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; + cpa.flags = 0; + cpa.curpage = 0; + cpa.force_split = force_split; + + if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa.flags |= in_flag; + + /* No alias checking for _NX bit modifications */ + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; + /* Has caller explicitly disabled alias checking? */ + if (in_flag & CPA_NO_CHECK_ALIAS) + checkalias = 0; + + ret = __change_page_attr_set_clr(&cpa, checkalias); + + /* + * Check whether we really changed something: + */ + if (!(cpa.flags & CPA_FLUSHTLB)) + goto out; + + /* + * No need to flush, when we did not set any of the caching + * attributes: + */ + cache = !!pgprot2cachemode(mask_set); + + /* + * On success we use CLFLUSH, when the CPU supports it to + * avoid the WBINVD. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * WBINVD): + */ + if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(addr, numpages, cache, + cpa.flags, pages); + } else + cpa_flush_range(baddr, numpages, cache); + } else + cpa_flush_all(cache); + +out: + return ret; +} + +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + (array ? CPA_ARRAY : 0), NULL); +} + +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + (array ? CPA_ARRAY : 0), NULL); +} + +static inline int cpa_set_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, + CPA_PAGES_ARRAY, pages); +} + +static inline int cpa_clear_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, + CPA_PAGES_ARRAY, pages); +} + +int _set_memory_uc(unsigned long addr, int numpages) +{ + /* + * for now UC MINUS. see comments in ioremap_nocache() + * If you really need strong UC use ioremap_uc(), but note + * that you cannot override IO areas with set_memory_*() as + * these helpers cannot work with IO memory. + */ + return change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + int ret; + + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_UC_MINUS, NULL); + if (ret) + goto out_err; + + ret = _set_memory_uc(addr, numpages); + if (ret) + goto out_free; + + return 0; + +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; +} +EXPORT_SYMBOL(set_memory_uc); + +static int _set_memory_array(unsigned long *addr, int addrinarray, + enum page_cache_mode new_type) +{ + enum page_cache_mode set_type; + int i, j; + int ret; + + for (i = 0; i < addrinarray; i++) { + ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, + new_type, NULL); + if (ret) + goto out_free; + } + + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? + _PAGE_CACHE_MODE_UC_MINUS : new_type; + + ret = change_page_attr_set(addr, addrinarray, + cachemode2pgprot(set_type), 1); + + if (!ret && new_type == _PAGE_CACHE_MODE_WC) + ret = change_page_attr_set_clr(addr, addrinarray, + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, CPA_ARRAY, NULL); + if (ret) + goto out_free; + + return 0; + +out_free: + for (j = 0; j < i; j++) + free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); + + return ret; +} + +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); +} +EXPORT_SYMBOL(set_memory_array_uc); + +int set_memory_array_wc(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); +} +EXPORT_SYMBOL(set_memory_array_wc); + +int set_memory_array_wt(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_memory_array_wt); + +int _set_memory_wc(unsigned long addr, int numpages) +{ + int ret; + unsigned long addr_copy = addr; + + ret = change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), + 0); + if (!ret) { + ret = change_page_attr_set_clr(&addr_copy, numpages, + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, 0, NULL); + } + return ret; +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + int ret; + + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_WC, NULL); + if (ret) + return ret; + + ret = _set_memory_wc(addr, numpages); + if (ret) + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL(set_memory_wc); + +int _set_memory_wt(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); +} + +int set_memory_wt(unsigned long addr, int numpages) +{ + int ret; + + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_MODE_WT, NULL); + if (ret) + return ret; + + ret = _set_memory_wt(addr, numpages); + if (ret) + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + + return ret; +} +EXPORT_SYMBOL_GPL(set_memory_wt); + +int _set_memory_wb(unsigned long addr, int numpages) +{ + /* WB cache mode is hard wired to all cache attribute bits being 0 */ + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); +} + +int set_memory_wb(unsigned long addr, int numpages) +{ + int ret; + + ret = _set_memory_wb(addr, numpages); + if (ret) + return ret; + + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + return 0; +} +EXPORT_SYMBOL(set_memory_wb); + +int set_memory_array_wb(unsigned long *addr, int addrinarray) +{ + int i; + int ret; + + /* WB cache mode is hard wired to all cache attribute bits being 0 */ + ret = change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); + if (ret) + return ret; + + for (i = 0; i < addrinarray; i++) + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); + + return 0; +} +EXPORT_SYMBOL(set_memory_array_wb); + +int set_memory_x(unsigned long addr, int numpages) +{ + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); +} +EXPORT_SYMBOL(set_memory_x); + +int set_memory_nx(unsigned long addr, int numpages) +{ + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); +} +EXPORT_SYMBOL(set_memory_nx); + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); +} + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); +} + +int set_memory_np(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + +int set_memory_np_noalias(unsigned long addr, int numpages) +{ + int cpa_flags = CPA_NO_CHECK_ALIAS; + + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(_PAGE_PRESENT), 0, + cpa_flags, NULL); +} + +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0, NULL); +} + +int set_memory_nonglobal(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + +int set_memory_global(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_GLOBAL), 0); +} + +static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +{ + struct cpa_data cpa; + unsigned long start; + int ret; + + /* Nothing to do if memory encryption is not active */ + if (!mem_encrypt_active()) + return 0; + + /* Should not be working on unaligned addresses */ + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) + addr &= PAGE_MASK; + + start = addr; + + memset(&cpa, 0, sizeof(cpa)); + cpa.vaddr = &addr; + cpa.numpages = numpages; + cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0); + cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC); + cpa.pgd = init_mm.pgd; + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + vm_unmap_aliases(); + + /* + * Before changing the encryption attribute, we need to flush caches. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 1); + else + cpa_flush_all(1); + + ret = __change_page_attr_set_clr(&cpa, 1); + + /* + * After changing the encryption attribute, we need to flush TLBs + * again in case any speculative TLB caching occurred (but no need + * to flush caches again). We could just use cpa_flush_all(), but + * in case TLB flushing gets optimized in the cpa_flush_range() + * path use the same logic as above. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 0); + else + cpa_flush_all(0); + + return ret; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, true); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, false); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); + +int set_pages_uc(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc); + +static int _set_pages_array(struct page **pages, int addrinarray, + enum page_cache_mode new_type) +{ + unsigned long start; + unsigned long end; + enum page_cache_mode set_type; + int i; + int free_idx; + int ret; + + for (i = 0; i < addrinarray; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + if (reserve_memtype(start, end, new_type, NULL)) + goto err_out; + } + + /* If WC, set to UC- first and then WC */ + set_type = (new_type == _PAGE_CACHE_MODE_WC) ? + _PAGE_CACHE_MODE_UC_MINUS : new_type; + + ret = cpa_set_pages_array(pages, addrinarray, + cachemode2pgprot(set_type)); + if (!ret && new_type == _PAGE_CACHE_MODE_WC) + ret = change_page_attr_set_clr(NULL, addrinarray, + cachemode2pgprot( + _PAGE_CACHE_MODE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, CPA_PAGES_ARRAY, pages); + if (ret) + goto err_out; + return 0; /* Success */ +err_out: + free_idx = i; + for (i = 0; i < free_idx; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + free_memtype(start, end); + } + return -EINVAL; +} + +int set_pages_array_uc(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); +} +EXPORT_SYMBOL(set_pages_array_uc); + +int set_pages_array_wc(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); +} +EXPORT_SYMBOL(set_pages_array_wc); + +int set_pages_array_wt(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT); +} +EXPORT_SYMBOL_GPL(set_pages_array_wt); + +int set_pages_wb(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_wb(addr, numpages); +} +EXPORT_SYMBOL(set_pages_wb); + +int set_pages_array_wb(struct page **pages, int addrinarray) +{ + int retval; + unsigned long start; + unsigned long end; + int i; + + /* WB cache mode is hard wired to all cache attribute bits being 0 */ + retval = cpa_clear_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_MASK)); + if (retval) + return retval; + + for (i = 0; i < addrinarray; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + free_memtype(start, end); + } + + return 0; +} +EXPORT_SYMBOL(set_pages_array_wb); + +int set_pages_x(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_x(addr, numpages); +} +EXPORT_SYMBOL(set_pages_x); + +int set_pages_nx(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_nx(addr, numpages); +} +EXPORT_SYMBOL(set_pages_nx); + +int set_pages_ro(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_ro(addr, numpages); +} + +int set_pages_rw(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_rw(addr, numpages); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __set_pages_p(struct page *page, int numpages) +{ + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, + .numpages = numpages, + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(0), + .flags = 0}; + + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); +} + +static int __set_pages_np(struct page *page, int numpages) +{ + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, + .pgd = NULL, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0}; + + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + if (!enable) { + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); + } + + /* + * The return value is ignored as the calls cannot fail. + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. + */ + if (enable) + __set_pages_p(page, numpages); + else + __set_pages_np(page, numpages); + + /* + * We should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu. + * Preemption needs to be disabled around __flush_tlb_all() due to + * CR3 reload in __native_flush_tlb(). + */ + preempt_disable(); + __flush_tlb_all(); + preempt_enable(); + + arch_flush_lazy_mmu_mode(); +} + +#ifdef CONFIG_HIBERNATION + +bool kernel_page_present(struct page *page) +{ + unsigned int level; + pte_t *pte; + + if (PageHighMem(page)) + return false; + + pte = lookup_address((unsigned long)page_address(page), &level); + return (pte_val(*pte) & _PAGE_PRESENT); +} + +#endif /* CONFIG_HIBERNATION */ + +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, + unsigned numpages, unsigned long page_flags) +{ + int retval = -EINVAL; + + struct cpa_data cpa = { + .vaddr = &address, + .pfn = pfn, + .pgd = pgd, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), + .flags = 0, + }; + + if (!(__supported_pte_mask & _PAGE_NX)) + goto out; + + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); + + retval = __change_page_attr_set_clr(&cpa, 0); + __flush_tlb_all(); + +out: + return retval; +} + +/* + * The testcases use internal knowledge of the implementation that shouldn't + * be exposed to the rest of the kernel. Include these directly here. + */ +#ifdef CONFIG_CPA_DEBUG +#include "pageattr-test.c" +#endif diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c new file mode 100644 index 000000000..b33304c00 --- /dev/null +++ b/arch/x86/mm/pat.c @@ -0,0 +1,1184 @@ +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. + */ + +#include <linux/seq_file.h> +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/ioport.h> +#include <linux/kernel.h> +#include <linux/pfn_t.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/rbtree.h> + +#include <asm/cacheflush.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/x86_init.h> +#include <asm/pgtable.h> +#include <asm/fcntl.h> +#include <asm/e820/api.h> +#include <asm/mtrr.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/pat.h> +#include <asm/io.h> + +#include "pat_internal.h" +#include "mm_internal.h" + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static bool __read_mostly boot_cpu_done; +static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static bool __read_mostly pat_initialized; +static bool __read_mostly init_cm_done; + +void pat_disable(const char *reason) +{ + if (pat_disabled) + return; + + if (boot_cpu_done) { + WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n"); + return; + } + + pat_disabled = true; + pr_info("x86/PAT: %s\n", reason); +} + +static int __init nopat(char *str) +{ + pat_disable("PAT support disabled."); + return 0; +} +early_param("nopat", nopat); + +bool pat_enabled(void) +{ + return pat_initialized; +} +EXPORT_SYMBOL_GPL(pat_enabled); + +int pat_debug_enable; + +static int __init pat_debug_setup(char *str) +{ + pat_debug_enable = 1; + return 1; +} +__setup("debugpat", pat_debug_setup); + +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags arch_1 and uncached together to keep track of + * memory type of pages that have backing page struct. + * + * X86 PAT supports 4 different memory types: + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_WT + * + * _PAGE_CACHE_MODE_WB is the default type. + */ + +#define _PGMT_WB 0 +#define _PGMT_WC (1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_uncached) +#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_CLEAR_MASK (~_PGMT_MASK) + +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + unsigned long pg_flags = pg->flags & _PGMT_MASK; + + if (pg_flags == _PGMT_WB) + return _PAGE_CACHE_MODE_WB; + else if (pg_flags == _PGMT_WC) + return _PAGE_CACHE_MODE_WC; + else if (pg_flags == _PGMT_UC_MINUS) + return _PAGE_CACHE_MODE_UC_MINUS; + else + return _PAGE_CACHE_MODE_WT; +} + +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ + unsigned long memtype_flags; + unsigned long old_flags; + unsigned long new_flags; + + switch (memtype) { + case _PAGE_CACHE_MODE_WC: + memtype_flags = _PGMT_WC; + break; + case _PAGE_CACHE_MODE_UC_MINUS: + memtype_flags = _PGMT_UC_MINUS; + break; + case _PAGE_CACHE_MODE_WT: + memtype_flags = _PGMT_WT; + break; + case _PAGE_CACHE_MODE_WB: + default: + memtype_flags = _PGMT_WB; + break; + } + + do { + old_flags = pg->flags; + new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; + } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); +} +#else +static inline enum page_cache_mode get_page_memtype(struct page *pg) +{ + return -1; +} +static inline void set_page_memtype(struct page *pg, + enum page_cache_mode memtype) +{ +} +#endif + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ +}; + +#define CM(c) (_PAGE_CACHE_MODE_ ## c) + +static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg) +{ + enum page_cache_mode cache; + char *cache_mode; + + switch (pat_val) { + case PAT_UC: cache = CM(UC); cache_mode = "UC "; break; + case PAT_WC: cache = CM(WC); cache_mode = "WC "; break; + case PAT_WT: cache = CM(WT); cache_mode = "WT "; break; + case PAT_WP: cache = CM(WP); cache_mode = "WP "; break; + case PAT_WB: cache = CM(WB); cache_mode = "WB "; break; + case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break; + default: cache = CM(WB); cache_mode = "WB "; break; + } + + memcpy(msg, cache_mode, 4); + + return cache; +} + +#undef CM + +/* + * Update the cache mode to pgprot translation tables according to PAT + * configuration. + * Using lower indices is preferred, so we start with highest index. + */ +static void __init_cache_modes(u64 pat) +{ + enum page_cache_mode cache; + char pat_msg[33]; + int i; + + pat_msg[32] = 0; + for (i = 7; i >= 0; i--) { + cache = pat_get_cache_mode((pat >> (i * 8)) & 7, + pat_msg + 4 * i); + update_cache_mode_entry(i, cache); + } + pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); + + init_cm_done = true; +} + +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) + +static void pat_bsp_init(u64 pat) +{ + u64 tmp_pat; + + if (!boot_cpu_has(X86_FEATURE_PAT)) { + pat_disable("PAT not supported by CPU."); + return; + } + + rdmsrl(MSR_IA32_CR_PAT, tmp_pat); + if (!tmp_pat) { + pat_disable("PAT MSR is 0, disabled."); + return; + } + + wrmsrl(MSR_IA32_CR_PAT, pat); + pat_initialized = true; + + __init_cache_modes(pat); +} + +static void pat_ap_init(u64 pat) +{ + if (!boot_cpu_has(X86_FEATURE_PAT)) { + /* + * If this happens we are on a secondary CPU, but switched to + * PAT on the boot CPU. We have no way to undo PAT. + */ + panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n"); + } + + wrmsrl(MSR_IA32_CR_PAT, pat); +} + +void init_cache_modes(void) +{ + u64 pat = 0; + + if (init_cm_done) + return; + + if (boot_cpu_has(X86_FEATURE_PAT)) { + /* + * CPU supports PAT. Set PAT table to be consistent with + * PAT MSR. This case supports "nopat" boot option, and + * virtual machine environments which support PAT without + * MTRRs. In specific, Xen has unique setup to PAT MSR. + * + * If PAT MSR returns 0, it is considered invalid and emulates + * as No PAT. + */ + rdmsrl(MSR_IA32_CR_PAT, pat); + } + + if (!pat) { + /* + * No PAT. Emulate the PAT table that corresponds to the two + * cache bits, PWT (Write Through) and PCD (Cache Disable). + * This setup is also the same as the BIOS default setup. + * + * PTE encoding: + * + * PCD + * |PWT PAT + * || slot + * 00 0 WB : _PAGE_CACHE_MODE_WB + * 01 1 WT : _PAGE_CACHE_MODE_WT + * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 11 3 UC : _PAGE_CACHE_MODE_UC + * + * NOTE: When WC or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC); + } + + __init_cache_modes(pat); +} + +/** + * pat_init - Initialize PAT MSR and PAT table + * + * This function initializes PAT MSR and PAT table with an OS-defined value + * to enable additional cache attributes, WC, WT and WP. + * + * This function must be called on all CPUs using the specific sequence of + * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this + * procedure for PAT. + */ +void pat_init(void) +{ + u64 pat; + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (pat_disabled) + return; + + if ((c->x86_vendor == X86_VENDOR_INTEL) && + (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || + ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) { + /* + * PAT support with the lower four entries. Intel Pentium 2, + * 3, M, and 4 are affected by PAT errata, which makes the + * upper four entries unusable. To be on the safe side, we don't + * use those. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * PAT bit unused + * + * NOTE: When WT or WP is used, it is redirected to UC- per + * the default setup in __cachemode2pte_tbl[]. + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + } else { + /* + * Full PAT support. We put WT in slot 7 to improve + * robustness in the presence of errata that might cause + * the high PAT bit to be ignored. This way, a buggy slot 7 + * access will hit slot 3, and slot 3 is UC, so at worst + * we lose performance without causing a correctness issue. + * Pentium 4 erratum N46 is an example for such an erratum, + * although we try not to use PAT at all on affected CPUs. + * + * PTE encoding: + * PAT + * |PCD + * ||PWT PAT + * ||| slot + * 000 0 WB : _PAGE_CACHE_MODE_WB + * 001 1 WC : _PAGE_CACHE_MODE_WC + * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS + * 011 3 UC : _PAGE_CACHE_MODE_UC + * 100 4 WB : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP + * 110 6 UC-: Reserved + * 111 7 WT : _PAGE_CACHE_MODE_WT + * + * The reserved slots are unused, but mapped to their + * corresponding types in the presence of PAT errata. + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); + } + + if (!boot_cpu_done) { + pat_bsp_init(pat); + boot_cpu_done = true; + } else { + pat_ap_init(pat); + } +} + +#undef PAT + +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ + +/* + * Does intersection of PAT memory type and MTRR memory type and returns + * the resulting memory type as PAT understands it. + * (Type in pat and mtrr will not have same value) + * The intersection is based on "Effective Memory Type" tables in IA-32 + * SDM vol 3a + */ +static unsigned long pat_x_mtrr_type(u64 start, u64 end, + enum page_cache_mode req_type) +{ + /* + * Look for MTRR hint to get the effective type in case where PAT + * request is for WB. + */ + if (req_type == _PAGE_CACHE_MODE_WB) { + u8 mtrr_type, uniform; + + mtrr_type = mtrr_type_lookup(start, end, &uniform); + if (mtrr_type != MTRR_TYPE_WRBACK) + return _PAGE_CACHE_MODE_UC_MINUS; + + return _PAGE_CACHE_MODE_WB; + } + + return req_type; +} + +struct pagerange_state { + unsigned long cur_pfn; + int ram; + int not_ram; +}; + +static int +pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) +{ + struct pagerange_state *state = arg; + + state->not_ram |= initial_pfn > state->cur_pfn; + state->ram |= total_nr_pages > 0; + state->cur_pfn = initial_pfn + total_nr_pages; + + return state->ram && state->not_ram; +} + +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) +{ + int ret = 0; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + struct pagerange_state state = {start_pfn, 0, 0}; + + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) + start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; + + if (start_pfn < end_pfn) { + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &state, pagerange_is_ram_callback); + } + + return (ret > 0) ? -1 : (state.ram ? 1 : 0); +} + +/* + * For RAM pages, we use page flags to mark the pages with appropriate type. + * The page flags are limited to four types, WB (default), WC, WT and UC-. + * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting + * a new memory type is only allowed for a page mapped with the default WB + * type. + * + * Here we do two passes: + * - Find the memtype of all the pages in the range, look for any conflicts. + * - In case of no conflicts, set the new memtype for pages in the range. + */ +static int reserve_ram_pages_type(u64 start, u64 end, + enum page_cache_mode req_type, + enum page_cache_mode *new_type) +{ + struct page *page; + u64 pfn; + + if (req_type == _PAGE_CACHE_MODE_WP) { + if (new_type) + *new_type = _PAGE_CACHE_MODE_UC_MINUS; + return -EINVAL; + } + + if (req_type == _PAGE_CACHE_MODE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_MODE_UC_MINUS; + } + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + enum page_cache_mode type; + + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != _PAGE_CACHE_MODE_WB) { + pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n", + start, end - 1, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } + } + + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, req_type); + } + return 0; +} + +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + u64 pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, _PAGE_CACHE_MODE_WB); + } + return 0; +} + +static u64 sanitize_phys(u64 address) +{ + /* + * When changing the memtype for pages containing poison allow + * for a "decoy" virtual address (bit 63 clear) passed to + * set_memory_X(). __pa() on a "decoy" address results in a + * physical address with bit 63 set. + * + * Decoy addresses are not present for 32-bit builds, see + * set_mce_nospec(). + */ + if (IS_ENABLED(CONFIG_X86_64)) + return address & __PHYSICAL_MASK; + return address; +} + +/* + * req_type typically has one of the: + * - _PAGE_CACHE_MODE_WB + * - _PAGE_CACHE_MODE_WC + * - _PAGE_CACHE_MODE_UC_MINUS + * - _PAGE_CACHE_MODE_UC + * - _PAGE_CACHE_MODE_WT + * + * If new_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If new_type is non-NULL, function will return + * available type in new_type in case of no error. In case of any error + * it will return a negative return value. + */ +int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, + enum page_cache_mode *new_type) +{ + struct memtype *new; + enum page_cache_mode actual_type; + int is_range_ram; + int err = 0; + + start = sanitize_phys(start); + end = sanitize_phys(end); + if (start >= end) { + WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__, + start, end - 1, cattr_name(req_type)); + return -EINVAL; + } + + if (!pat_enabled()) { + /* This is identical to page table setting without PAT */ + if (new_type) + *new_type = req_type; + return 0; + } + + /* Low ISA region is always mapped WB in page table. No need to track */ + if (x86_platform.is_untracked_pat_range(start, end)) { + if (new_type) + *new_type = _PAGE_CACHE_MODE_WB; + return 0; + } + + /* + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. + */ + actual_type = pat_x_mtrr_type(start, end, req_type); + + if (new_type) + *new_type = actual_type; + + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + err = reserve_ram_pages_type(start, end, req_type, new_type); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; + } + + new = kzalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->start = start; + new->end = end; + new->type = actual_type; + + spin_lock(&memtype_lock); + + err = rbt_memtype_check_insert(new, new_type); + if (err) { + pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); + kfree(new); + spin_unlock(&memtype_lock); + + return err; + } + + spin_unlock(&memtype_lock); + + dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(new->type), cattr_name(req_type), + new_type ? cattr_name(*new_type) : "-"); + + return err; +} + +int free_memtype(u64 start, u64 end) +{ + int err = -EINVAL; + int is_range_ram; + struct memtype *entry; + + if (!pat_enabled()) + return 0; + + start = sanitize_phys(start); + end = sanitize_phys(end); + + /* Low ISA region is always mapped WB. No need to track */ + if (x86_platform.is_untracked_pat_range(start, end)) + return 0; + + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + err = free_ram_pages_type(start, end); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; + } + + spin_lock(&memtype_lock); + entry = rbt_memtype_erase(start, end); + spin_unlock(&memtype_lock); + + if (IS_ERR(entry)) { + pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); + return -EINVAL; + } + + kfree(entry); + + dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1); + + return 0; +} + + +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS + * or _PAGE_CACHE_MODE_WT. + */ +static enum page_cache_mode lookup_memtype(u64 paddr) +{ + enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB; + struct memtype *entry; + + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + + page = pfn_to_page(paddr >> PAGE_SHIFT); + return get_page_memtype(page); + } + + spin_lock(&memtype_lock); + + entry = rbt_memtype_lookup(paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_MODE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} + +/** + * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type + * of @pfn cannot be overridden by UC MTRR memory type. + * + * Only to be called when PAT is enabled. + * + * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC. + * Returns false in other cases. + */ +bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) +{ + enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn)); + + return cm == _PAGE_CACHE_MODE_UC || + cm == _PAGE_CACHE_MODE_UC_MINUS || + cm == _PAGE_CACHE_MODE_WC; +} +EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); + +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + enum page_cache_mode *type) +{ + resource_size_t size = end - start; + enum page_cache_mode req_type = *type; + enum page_cache_mode new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(start, size, req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, size, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + +int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) +{ + enum page_cache_mode type = _PAGE_CACHE_MODE_WC; + + return io_reserve_memtype(start, start + size, &type); +} +EXPORT_SYMBOL(arch_io_reserve_memtype_wc); + +void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) +{ + io_free_memtype(start, start + size); +} +EXPORT_SYMBOL(arch_io_free_memtype_wc); + +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) + vma_prot = pgprot_decrypted(vma_prot); + + return vma_prot; +} + +#ifdef CONFIG_STRICT_DEVMEM +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */ +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + return 1; +} +#else +/* This check is needed to avoid cache aliasing when PAT is enabled */ +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + u64 from = ((u64)pfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + if (!pat_enabled()) + return 1; + + while (cursor < to) { + if (!devmem_is_allowed(pfn)) + return 0; + cursor += PAGE_SIZE; + pfn++; + } + return 1; +} +#endif /* CONFIG_STRICT_DEVMEM */ + +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot) +{ + enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB; + + if (!range_is_allowed(pfn, size)) + return 0; + + if (file->f_flags & O_DSYNC) + pcm = _PAGE_CACHE_MODE_UC_MINUS; + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + cachemode2protval(pcm)); + return 1; +} + +/* + * Change the memory type for the physial address range in kernel identity + * mapping space if that range is a part of identity map. + */ +int kernel_map_sync_memtype(u64 base, unsigned long size, + enum page_cache_mode pcm) +{ + unsigned long id_sz; + + if (base > __pa(high_memory-1)) + return 0; + + /* + * some areas in the middle of the kernel identity range + * are not mapped, like the PCI space. + */ + if (!page_is_ram(base >> PAGE_SHIFT)) + return 0; + + id_sz = (__pa(high_memory-1) <= base + size) ? + __pa(high_memory) - base : + size; + + if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) { + pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, + cattr_name(pcm), + base, (unsigned long long)(base + size-1)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, + int strict_prot) +{ + int is_ram = 0; + int ret; + enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot); + enum page_cache_mode pcm = want_pcm; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + + /* + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. + */ + if (is_ram) { + if (!pat_enabled()) + return 0; + + pcm = lookup_memtype(paddr); + if (want_pcm != pcm) { + pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_pcm), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(pcm)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); + } + return 0; + } + + ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm); + if (ret) + return ret; + + if (pcm != want_pcm) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) { + free_memtype(paddr, paddr + size); + pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n", + current->comm, current->pid, + cattr_name(want_pcm), + (unsigned long long)paddr, + (unsigned long long)(paddr + size - 1), + cattr_name(pcm)); + return -EINVAL; + } + /* + * We allow returning different type than the one requested in + * non strict case. + */ + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); + } + + if (kernel_map_sync_memtype(paddr, size, pcm) < 0) { + free_memtype(paddr, paddr + size); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + */ +int track_pfn_copy(struct vm_area_struct *vma) +{ + resource_size_t paddr; + unsigned long prot; + unsigned long vma_size = vma->vm_end - vma->vm_start; + pgprot_t pgprot; + + if (vma->vm_flags & VM_PAT) { + /* + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. + */ + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + pgprot = __pgprot(prot); + return reserve_pfn_range(paddr, vma_size, &pgprot, 1); + } + + return 0; +} + +/* + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. + */ +int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long addr, unsigned long size) +{ + resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT; + enum page_cache_mode pcm; + + /* reserve the whole chunk starting from paddr */ + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { + int ret; + + ret = reserve_pfn_range(paddr, size, prot, 0); + if (ret == 0 && vma) + vma->vm_flags |= VM_PAT; + return ret; + } + + if (!pat_enabled()) + return 0; + + /* + * For anything smaller than the vma size we set prot based on the + * lookup. + */ + pcm = lookup_memtype(paddr); + + /* Check memtype for the remaining pages */ + while (size > PAGE_SIZE) { + size -= PAGE_SIZE; + paddr += PAGE_SIZE; + if (pcm != lookup_memtype(paddr)) + return -EINVAL; + } + + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); + + return 0; +} + +void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) +{ + enum page_cache_mode pcm; + + if (!pat_enabled()) + return; + + /* Set prot based on lookup */ + pcm = lookup_memtype(pfn_t_to_phys(pfn)); + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | + cachemode2protval(pcm)); +} + +/* + * untrack_pfn is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case pfn, size are zero). + */ +void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + resource_size_t paddr; + unsigned long prot; + + if (vma && !(vma->vm_flags & VM_PAT)) + return; + + /* free the chunk starting from pfn or the whole chunk */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + if (!paddr && !size) { + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return; + } + + size = vma->vm_end - vma->vm_start; + } + free_pfn_range(paddr, size); + if (vma) + vma->vm_flags &= ~VM_PAT; +} + +/* + * untrack_pfn_moved is called, while mremapping a pfnmap for a new region, + * with the old vma after its pfnmap page table has been removed. The new + * vma has a new pfnmap to the same pfn & cache type with VM_PAT set. + */ +void untrack_pfn_moved(struct vm_area_struct *vma) +{ + vma->vm_flags &= ~VM_PAT; +} + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WC)); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | + cachemode2protval(_PAGE_CACHE_MODE_WT)); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) + +static struct memtype *memtype_get_idx(loff_t pos) +{ + struct memtype *print_entry; + int ret; + + print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL); + if (!print_entry) + return NULL; + + spin_lock(&memtype_lock); + ret = rbt_memtype_copy_nth_element(print_entry, pos); + spin_unlock(&memtype_lock); + + if (!ret) { + return print_entry; + } else { + kfree(print_entry); + return NULL; + } +} + +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos == 0) { + ++*pos; + seq_puts(seq, "PAT memtype list:\n"); + } + + return memtype_get_idx(*pos); +} + +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + kfree(v); + ++*pos; + return memtype_get_idx(*pos); +} + +static void memtype_seq_stop(struct seq_file *seq, void *v) +{ + kfree(v); +} + +static int memtype_seq_show(struct seq_file *seq, void *v) +{ + struct memtype *print_entry = (struct memtype *)v; + + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), + print_entry->start, print_entry->end); + + return 0; +} + +static const struct seq_operations memtype_seq_ops = { + .start = memtype_seq_start, + .next = memtype_seq_next, + .stop = memtype_seq_stop, + .show = memtype_seq_show, +}; + +static int memtype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &memtype_seq_ops); +} + +static const struct file_operations memtype_fops = { + .open = memtype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init pat_memtype_list_init(void) +{ + if (pat_enabled()) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } + return 0; +} + +late_initcall(pat_memtype_list_init); + +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h new file mode 100644 index 000000000..eeb5caeb0 --- /dev/null +++ b/arch/x86/mm/pat_internal.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PAT_INTERNAL_H_ +#define __PAT_INTERNAL_H_ + +extern int pat_debug_enable; + +#define dprintk(fmt, arg...) \ + do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0) + +struct memtype { + u64 start; + u64 end; + u64 subtree_max_end; + enum page_cache_mode type; + struct rb_node rb; +}; + +static inline char *cattr_name(enum page_cache_mode pcm) +{ + switch (pcm) { + case _PAGE_CACHE_MODE_UC: return "uncached"; + case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_MODE_WB: return "write-back"; + case _PAGE_CACHE_MODE_WC: return "write-combining"; + case _PAGE_CACHE_MODE_WT: return "write-through"; + case _PAGE_CACHE_MODE_WP: return "write-protected"; + default: return "broken"; + } +} + +#ifdef CONFIG_X86_PAT +extern int rbt_memtype_check_insert(struct memtype *new, + enum page_cache_mode *new_type); +extern struct memtype *rbt_memtype_erase(u64 start, u64 end); +extern struct memtype *rbt_memtype_lookup(u64 addr); +extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); +#else +static inline int rbt_memtype_check_insert(struct memtype *new, + enum page_cache_mode *new_type) +{ return 0; } +static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) +{ return NULL; } +static inline struct memtype *rbt_memtype_lookup(u64 addr) +{ return NULL; } +static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) +{ return 0; } +#endif + +#endif /* __PAT_INTERNAL_H_ */ diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c new file mode 100644 index 000000000..fa16036fa --- /dev/null +++ b/arch/x86/mm/pat_rbtree.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Interval tree (augmented rbtree) used to store the PAT memory type + * reservations. + */ + +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/rbtree_augmented.h> +#include <linux/sched.h> +#include <linux/gfp.h> + +#include <asm/pgtable.h> +#include <asm/pat.h> + +#include "pat_internal.h" + +/* + * The memtype tree keeps track of memory type for specific + * physical memory areas. Without proper tracking, conflicting memory + * types in different mappings can cause CPU cache corruption. + * + * The tree is an interval tree (augmented rbtree) with tree ordered + * on starting address. Tree can contain multiple entries for + * different regions which overlap. All the aliases have the same + * cache attributes of course. + * + * memtype_lock protects the rbtree. + */ + +static struct rb_root memtype_rbroot = RB_ROOT; + +static int is_node_overlap(struct memtype *node, u64 start, u64 end) +{ + if (node->start >= end || node->end <= start) + return 0; + + return 1; +} + +static u64 get_subtree_max_end(struct rb_node *node) +{ + u64 ret = 0; + if (node) { + struct memtype *data = rb_entry(node, struct memtype, rb); + ret = data->subtree_max_end; + } + return ret; +} + +static u64 compute_subtree_max_end(struct memtype *data) +{ + u64 max_end = data->end, child_max_end; + + child_max_end = get_subtree_max_end(data->rb.rb_right); + if (child_max_end > max_end) + max_end = child_max_end; + + child_max_end = get_subtree_max_end(data->rb.rb_left); + if (child_max_end > max_end) + max_end = child_max_end; + + return max_end; +} + +RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, + u64, subtree_max_end, compute_subtree_max_end) + +/* Find the first (lowest start addr) overlapping range from rb tree */ +static struct memtype *memtype_rb_lowest_match(struct rb_root *root, + u64 start, u64 end) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = rb_entry(node, struct memtype, rb); + + if (get_subtree_max_end(node->rb_left) > start) { + /* Lowest overlap if any must be on left side */ + node = node->rb_left; + } else if (is_node_overlap(data, start, end)) { + last_lower = data; + break; + } else if (start >= data->start) { + /* Lowest overlap if any must be on right side */ + node = node->rb_right; + } else { + break; + } + } + return last_lower; /* Returns NULL if there is no overlap */ +} + +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_rb_match(struct rb_root *root, + u64 start, u64 end, int match_type) +{ + struct memtype *match; + + match = memtype_rb_lowest_match(root, start, end); + while (match != NULL && match->start < end) { + struct rb_node *node; + + if ((match_type == MEMTYPE_EXACT_MATCH) && + (match->start == start) && (match->end == end)) + return match; + + if ((match_type == MEMTYPE_END_MATCH) && + (match->start < start) && (match->end == end)) + return match; + + node = rb_next(&match->rb); + if (node) + match = rb_entry(node, struct memtype, rb); + else + match = NULL; + } + + return NULL; /* Returns NULL if there is no match */ +} + +static int memtype_rb_check_conflict(struct rb_root *root, + u64 start, u64 end, + enum page_cache_mode reqtype, + enum page_cache_mode *newtype) +{ + struct rb_node *node; + struct memtype *match; + enum page_cache_mode found_type = reqtype; + + match = memtype_rb_lowest_match(&memtype_rbroot, start, end); + if (match == NULL) + goto success; + + if (match->type != found_type && newtype == NULL) + goto failure; + + dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); + found_type = match->type; + + node = rb_next(&match->rb); + while (node) { + match = rb_entry(node, struct memtype, rb); + + if (match->start >= end) /* Checked all possible matches */ + goto success; + + if (is_node_overlap(match, start, end) && + match->type != found_type) { + goto failure; + } + + node = rb_next(&match->rb); + } +success: + if (newtype) + *newtype = found_type; + + return 0; + +failure: + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(match->type)); + return -EBUSY; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) +{ + struct rb_node **node = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*node) { + struct memtype *data = rb_entry(*node, struct memtype, rb); + + parent = *node; + if (data->subtree_max_end < newdata->end) + data->subtree_max_end = newdata->end; + if (newdata->start <= data->start) + node = &((*node)->rb_left); + else if (newdata->start > data->start) + node = &((*node)->rb_right); + } + + newdata->subtree_max_end = newdata->end; + rb_link_node(&newdata->rb, parent, node); + rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); +} + +int rbt_memtype_check_insert(struct memtype *new, + enum page_cache_mode *ret_type) +{ + int err = 0; + + err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end, + new->type, ret_type); + + if (!err) { + if (ret_type) + new->type = *ret_type; + + new->subtree_max_end = new->end; + memtype_rb_insert(&memtype_rbroot, new); + } + return err; +} + +struct memtype *rbt_memtype_erase(u64 start, u64 end) +{ + struct memtype *data; + + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_EXACT_MATCH); + if (!data) { + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_END_MATCH); + if (!data) + return ERR_PTR(-EINVAL); + } + + if (data->start == start) { + /* munmap: erase this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + } else { + /* mremap: update the end value of this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + data->end = start; + data->subtree_max_end = data->end; + memtype_rb_insert(&memtype_rbroot, data); + return NULL; + } + + return data; +} + +struct memtype *rbt_memtype_lookup(u64 addr) +{ + return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); +} + +#if defined(CONFIG_DEBUG_FS) +int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) +{ + struct rb_node *node; + int i = 1; + + node = rb_first(&memtype_rbroot); + while (node && pos != i) { + node = rb_next(node); + i++; + } + + if (node) { /* pos == i */ + struct memtype *this = rb_entry(node, struct memtype, rb); + *out = *this; + return 0; + } else { + return 1; + } +} +#endif diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c new file mode 100644 index 000000000..a23586953 --- /dev/null +++ b/arch/x86/mm/pf_in.c @@ -0,0 +1,531 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include <linux/ptrace.h> /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B, 0xAB }; +#endif /* not __i386__ */ + +struct prefix_bits { + unsigned shorted:1; + unsigned enlarged:1; + unsigned rexr:1; + unsigned rex:1; +}; + +static int skip_prefix(unsigned char *addr, struct prefix_bits *prf) +{ + int i; + unsigned char *p = addr; + prf->shorted = 0; + prf->enlarged = 0; + prf->rexr = 0; + prf->rex = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == prefix_codes[i]) { + if (*p == 0x66) + prf->shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + prf->enlarged = 1; + if ((*p & 0xf4) == 0x44) + prf->rexr = 1; + if ((*p & 0xf0) == 0x40) + prf->rex = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return prf.shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. + */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)®s->ax; + break; + case arg_BL: + rv = (unsigned char *)®s->bx; + break; + case arg_CL: + rv = (unsigned char *)®s->cx; + break; + case arg_DL: + rv = (unsigned char *)®s->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)®s->r8; + break; + case arg_R9: + rv = (unsigned char *)®s->r9; + break; + case arg_R10: + rv = (unsigned char *)®s->r10; + break; + case arg_R11: + rv = (unsigned char *)®s->r11; + break; + case arg_R12: + rv = (unsigned char *)®s->r12; + break; + case arg_R13: + rv = (unsigned char *)®s->r13; + break; + case arg_R14: + rv = (unsigned char *)®s->r14; + break; + case arg_R15: + rv = (unsigned char *)®s->r15; + break; +#endif + default: + break; + } + + if (rv) + return rv; + + if (rex) { + /* + * If REX prefix exists, access low bytes of SI etc. + * instead of AH etc. + */ + switch (no) { + case arg_SI: + rv = (unsigned char *)®s->si; + break; + case arg_DI: + rv = (unsigned char *)®s->di; + break; + case arg_BP: + rv = (unsigned char *)®s->bp; + break; + case arg_SP: + rv = (unsigned char *)®s->sp; + break; + default: + break; + } + } else { + switch (no) { + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; + default: + break; + } + } + + if (!rv) + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = ®s->ax; + break; + case arg_BX: + rv = ®s->bx; + break; + case arg_CX: + rv = ®s->cx; + break; + case arg_DX: + rv = ®s->dx; + break; + case arg_SP: + rv = ®s->sp; + break; + case arg_BP: + rv = ®s->bp; + break; + case arg_SI: + rv = ®s->si; + break; + case arg_DI: + rv = ®s->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = ®s->r8; + break; + case arg_R9: + rv = ®s->r9; + break; + case arg_R10: + rv = ®s->r10; + break; + case arg_R11: + rv = ®s->r11; + break; + case arg_R12: + rv = ®s->r12; + break; + case arg_R13: + rv = ®s->r13; + break; + case arg_R14: + rv = ®s->r14; + break; + case arg_R15: + rv = ®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + int reg; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) + goto do_work; + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) + goto do_work; + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + /* for STOS, source register is fixed */ + if (opcode == 0xAA || opcode == 0xAB) { + reg = arg_AX; + } else { + unsigned char mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + } + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, prf.rex, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + +unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) + goto do_work; + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? */ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h new file mode 100644 index 000000000..e05341a51 --- /dev/null +++ b/arch/x86/mm/pf_in.h @@ -0,0 +1,39 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 000000000..c0e9c0040 --- /dev/null +++ b/arch/x86/mm/pgtable.c @@ -0,0 +1,891 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/hugetlb.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> +#include <asm/fixmap.h> +#include <asm/mtrr.h> + +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; +EXPORT_SYMBOL(physical_mask); +#endif + +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) + +#ifdef CONFIG_HIGHPTE +#define PGALLOC_USER_GFP __GFP_HIGHMEM +#else +#define PGALLOC_USER_GFP 0 +#endif + +gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + + pte = alloc_pages(__userpte_alloc_gfp, 0); + if (!pte) + return NULL; + if (!pgtable_page_ctor(pte)) { + __free_page(pte); + return NULL; + } + return pte; +} + +static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. + */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + +void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pte(page_to_pfn(pte)); + paravirt_tlb_remove_table(tlb, pte); +} + +#if CONFIG_PGTABLE_LEVELS > 2 +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + struct page *page = virt_to_page(pmd); + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + /* + * NOTE! For PAE, any changes to the top page-directory-pointer-table + * entries need a full cr3 reload to flush. + */ +#ifdef CONFIG_X86_PAE + tlb->need_flush_all = 1; +#endif + pgtable_pmd_page_dtor(page); + paravirt_tlb_remove_table(tlb, page); +} + +#if CONFIG_PGTABLE_LEVELS > 3 +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); + paravirt_tlb_remove_table(tlb, virt_to_page(pud)); +} + +#if CONFIG_PGTABLE_LEVELS > 4 +void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) +{ + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); + paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); +} +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) +#define MAX_UNSHARED_PTRS_PER_PGD \ + max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) + + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + virt_to_page(pgd)->pt_mm = mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return page->pt_mm; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) +{ + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (CONFIG_PGTABLE_LEVELS == 2 || + (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + CONFIG_PGTABLE_LEVELS >= 4) { + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); + pgd_list_add(pgd); + } +} + +static void pgd_dtor(pgd_t *pgd) +{ + if (SHARED_KERNEL_PMD) + return; + + spin_lock(&pgd_lock); + pgd_list_del(pgd); + spin_unlock(&pgd_lock); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- nyc + */ + +#ifdef CONFIG_X86_PAE +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD +#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD + +/* + * We allocate separate PMDs for the kernel part of the user page-table + * when PTI is enabled. We need them to map the per-process LDT into the + * user-space page-table. + */ +#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ + KERNEL_PGD_PTRS : 0) +#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + flush_tlb_mm(mm); +} +#else /* !CONFIG_X86_PAE */ + +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +#define PREALLOCATED_PMDS 0 +#define MAX_PREALLOCATED_PMDS 0 +#define PREALLOCATED_USER_PMDS 0 +#define MAX_PREALLOCATED_USER_PMDS 0 +#endif /* CONFIG_X86_PAE */ + +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) +{ + int i; + + for (i = 0; i < count; i++) + if (pmds[i]) { + pgtable_pmd_page_dtor(virt_to_page(pmds[i])); + free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); + } +} + +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) +{ + int i; + bool failed = false; + gfp_t gfp = PGALLOC_GFP; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + + for (i = 0; i < count; i++) { + pmd_t *pmd = (pmd_t *)__get_free_page(gfp); + if (!pmd) + failed = true; + if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { + free_page((unsigned long)pmd); + pmd = NULL; + failed = true; + } + if (pmd) + mm_inc_nr_pmds(mm); + pmds[i] = pmd; + } + + if (failed) { + free_pmds(mm, pmds, count); + return -ENOMEM; + } + + return 0; +} + +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) +{ + pgd_t pgd = *pgdp; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgd_clear(pgdp); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); + } +} + +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for (i = 0; i < PREALLOCATED_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i]); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pgdp = kernel_to_user_pgdp(pgdp); + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++) + mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); +#endif +} + +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) +{ + p4d_t *p4d; + pud_t *pud; + int i; + + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ + return; + + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); + + for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { + pmd_t *pmd = pmds[i]; + + if (i >= KERNEL_PGD_BOUNDARY) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, pud, pmd); + } +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ + pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + p4d_t *u_p4d; + pud_t *u_pud; + int i; + + u_p4d = p4d_offset(u_pgd, 0); + u_pud = pud_offset(u_p4d, 0); + + s_pgd += KERNEL_PGD_BOUNDARY; + u_pud += KERNEL_PGD_BOUNDARY; + + for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { + pmd_t *pmd = pmds[i]; + + memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, u_pud, pmd); + } + +} +#else +static void pgd_prepopulate_user_pmd(struct mm_struct *mm, + pgd_t *k_pgd, pmd_t *pmds[]) +{ +} +#endif +/* + * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also + * assumes that pgd should be in one page. + * + * But kernel with PAE paging that is not running as a Xen domain + * only needs to allocate 32 bytes for pgd instead of one page. + */ +#ifdef CONFIG_X86_PAE + +#include <linux/slab.h> + +#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) +#define PGD_ALIGN 32 + +static struct kmem_cache *pgd_cache; + +static int __init pgd_cache_init(void) +{ + /* + * When PAE kernel is running as a Xen domain, it does not use + * shared kernel pmd. And this requires a whole page for pgd. + */ + if (!SHARED_KERNEL_PMD) + return 0; + + /* + * when PAE kernel is not running as a Xen domain, it uses + * shared kernel pmd. Shared kernel pmd does not require a whole + * page for pgd. We are able to just allocate a 32-byte for pgd. + * During boot time, we create a 32-byte slab for pgd table allocation. + */ + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, + SLAB_PANIC, NULL); + return 0; +} +core_initcall(pgd_cache_init); + +static inline pgd_t *_pgd_alloc(void) +{ + /* + * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. + * We allocate one page for pgd. + */ + if (!SHARED_KERNEL_PMD) + return (pgd_t *)__get_free_pages(PGALLOC_GFP, + PGD_ALLOCATION_ORDER); + + /* + * Now PAE kernel is not running as a Xen domain. We can allocate + * a 32-byte slab for pgd to save memory space. + */ + return kmem_cache_alloc(pgd_cache, PGALLOC_GFP); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + if (!SHARED_KERNEL_PMD) + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); + else + kmem_cache_free(pgd_cache, pgd); +} +#else + +static inline pgd_t *_pgd_alloc(void) +{ + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); +} + +static inline void _pgd_free(pgd_t *pgd) +{ + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); +} +#endif /* CONFIG_X86_PAE */ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd; + pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; + pmd_t *pmds[MAX_PREALLOCATED_PMDS]; + + pgd = _pgd_alloc(); + + if (pgd == NULL) + goto out; + + mm->pgd = pgd; + + if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) + goto out_free_pgd; + + if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) + goto out_free_pmds; + + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_user_pmds; + + /* + * Make sure that pre-populating the pmds is atomic with + * respect to anything walking the pgd_list, so that they + * never see a partially populated pgd. + */ + spin_lock(&pgd_lock); + + pgd_ctor(mm, pgd); + pgd_prepopulate_pmd(mm, pgd, pmds); + pgd_prepopulate_user_pmd(mm, pgd, u_pmds); + + spin_unlock(&pgd_lock); + + return pgd; + +out_free_user_pmds: + free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); +out_free_pmds: + free_pmds(mm, pmds, PREALLOCATED_PMDS); +out_free_pgd: + _pgd_free(pgd); +out: + return NULL; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + paravirt_pgd_free(mm, pgd); + _pgd_free(pgd); +} + +/* + * Used to set accessed or dirty bits in the page table entries + * on other architectures. On x86, the accessed and dirty bits + * are tracked by hardware. However, do_wp_page calls this function + * to also make the pte writeable at the same time the dirty bit is + * set. In that case we do actually need to write the PTE. + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) + set_pte(ptep, entry); + + return changed; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (changed && dirty) { + set_pmd(pmdp, entry); + /* + * We had a write-protection fault here and changed the pmd + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ + } + + return changed; +} + +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed = !pud_same(*pudp, entry); + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + + if (changed && dirty) { + set_pud(pudp, entry); + /* + * We had a write-protection fault here and changed the pud + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ + } + + return changed; +} +#endif + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &ptep->pte); + + return ret; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int ret = 0; + + if (pmd_young(*pmdp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pmdp); + + return ret; +} +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) +{ + int ret = 0; + + if (pud_young(*pudp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pudp); + + return ret; +} +#endif + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + /* + * On x86 CPUs, clearing the accessed bit without a TLB flush + * doesn't cause data corruption. [ It could cause incorrect + * page aging and the (mistaken) reclaim of hot pages, but the + * chance of that should be relatively low. ] + * + * So as a performance optimization don't flush the TLB when + * clearing the accessed bit, it will eventually be flushed by + * a context switch or a VM operation anyway. [ In the rare + * event of it not getting flushed for a long time the delay + * shouldn't really matter because there's no real memory + * pressure for swapout to react to. ] + */ + return ptep_test_and_clear_young(vma, address, ptep); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + return young; +} +#endif + +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void __init reserve_top_address(unsigned long reserve) +{ +#ifdef CONFIG_X86_32 + BUG_ON(fixmaps_set > 0); + __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; + printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", + -reserve, __FIXADDR_TOP + PAGE_SIZE); +#endif +} + +int fixmaps_set; + +void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) +{ + unsigned long address = __fix_to_virt(idx); + +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. + */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_vaddr(address, pte); + fixmaps_set++; +} + +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) +{ + /* Sanitize 'prot' against any unsupported bits: */ + pgprot_val(flags) &= __default_kernel_pte_mask; + + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); +} + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +#ifdef CONFIG_X86_5LEVEL +/** + * p4d_set_huge - setup kernel P4D mapping + * + * No 512GB pages yet -- always return 0 + */ +int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} + +/** + * p4d_clear_huge - clear kernel P4D mapping when it is set + * + * No 512GB pages yet -- always return 0 + */ +int p4d_clear_huge(p4d_t *p4d) +{ + return 0; +} +#endif + +/** + * pud_set_huge - setup kernel PUD mapping + * + * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this + * function sets up a huge page only if any of the following conditions are met: + * + * - MTRRs are disabled, or + * + * - MTRRs are enabled and the range is completely covered by a single MTRR, or + * + * - MTRRs are enabled and the corresponding MTRR memory type is WB, which + * has no effect on the requested PAT memory type. + * + * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger + * page mapping attempt fails. + * + * Returns 1 on success and 0 on failure. + */ +int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + u8 mtrr, uniform; + + mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) + return 0; + + /* Bail out if we are we on a populated non-leaf entry: */ + if (pud_present(*pud) && !pud_huge(*pud)) + return 0; + + prot = pgprot_4k_2_large(prot); + + set_pte((pte_t *)pud, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + + return 1; +} + +/** + * pmd_set_huge - setup kernel PMD mapping + * + * See text over pud_set_huge() above. + * + * Returns 1 on success and 0 on failure. + */ +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + u8 mtrr, uniform; + + mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); + if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) && + (mtrr != MTRR_TYPE_WRBACK)) { + pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", + __func__, addr, addr + PMD_SIZE); + return 0; + } + + /* Bail out if we are we on a populated non-leaf entry: */ + if (pmd_present(*pmd) && !pmd_huge(*pmd)) + return 0; + + prot = pgprot_4k_2_large(prot); + + set_pte((pte_t *)pmd, pfn_pte( + (u64)addr >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + + return 1; +} + +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ +int pud_clear_huge(pud_t *pud) +{ + if (pud_large(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + +/** + * pmd_clear_huge - clear kernel PMD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PMD map is found). + */ +int pmd_clear_huge(pmd_t *pmd) +{ + if (pmd_large(*pmd)) { + pmd_clear(pmd); + return 1; + } + + return 0; +} + +#ifdef CONFIG_X86_64 +/** + * pud_free_pmd_page - Clear pud entry and free pmd page. + * @pud: Pointer to a PUD. + * @addr: Virtual address associated with pud. + * + * Context: The pud range has been unmapped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + * + * NOTE: Callers must allow a single page allocation. + */ +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd, *pmd_sv; + pte_t *pte; + int i; + + if (pud_none(*pud)) + return 1; + + pmd = (pmd_t *)pud_page_vaddr(*pud); + pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); + if (!pmd_sv) + return 0; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd_sv[i] = pmd[i]; + if (!pmd_none(pmd[i])) + pmd_clear(&pmd[i]); + } + + pud_clear(pud); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd_sv[i])) { + pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); + free_page((unsigned long)pte); + } + } + + free_page((unsigned long)pmd_sv); + + pgtable_pmd_page_dtor(virt_to_page(pmd)); + free_page((unsigned long)pmd); + + return 1; +} + +/** + * pmd_free_pte_page - Clear pmd entry and free pte page. + * @pmd: Pointer to a PMD. + * @addr: Virtual address associated with pmd. + * + * Context: The pmd range has been unmapped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + */ +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + + if (pmd_none(*pmd)) + return 1; + + pte = (pte_t *)pmd_page_vaddr(*pmd); + pmd_clear(pmd); + + /* INVLPG to clear all paging-structure caches */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); + + free_page((unsigned long)pte); + + return 1; +} + +#else /* !CONFIG_X86_64 */ + +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + return pud_none(*pud); +} + +/* + * Disable free page handling on x86-PAE. This assures that ioremap() + * does not update sync'd pmd entries. See vmalloc_sync_one(). + */ +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + return pmd_none(*pmd); +} + +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c new file mode 100644 index 000000000..9bb7f0ab9 --- /dev/null +++ b/arch/x86/mm/pgtable_32.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/nmi.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> + +#include <asm/cpu_entry_area.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/e820/api.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +unsigned int __VMALLOC_RESERVE = 128 << 20; + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + if (!pte_none(pteval)) + set_pte_at(&init_mm, vaddr, pte, pteval); + else + pte_clear(&init_mm, vaddr, pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one_kernel(vaddr); +} + +unsigned long __FIXADDR_TOP = 0xfffff000; +EXPORT_SYMBOL(__FIXADDR_TOP); + +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/ + __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; + return 0; +} +early_param("vmalloc", parse_vmalloc); + +/* + * reservetop=size reserves a hole at the top of the kernel address space which + * a hypervisor can load into later. Needed for dynamically loaded hypervisors, + * so relocating the fixmap can be done before paging initialization. + */ +static int __init parse_reservetop(char *arg) +{ + unsigned long address; + + if (!arg) + return -EINVAL; + + address = memparse(arg, &arg); + reserve_top_address(address); + early_ioremap_init(); + return 0; +} +early_param("reservetop", parse_reservetop); diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c new file mode 100644 index 000000000..7f9acb683 --- /dev/null +++ b/arch/x86/mm/physaddr.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bootmem.h> +#include <linux/mmdebug.h> +#include <linux/export.h> +#include <linux/mm.h> + +#include <asm/page.h> + +#include "physaddr.h" + +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + } else { + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); + } + + return x; +} +EXPORT_SYMBOL(__phys_addr); + +unsigned long __phys_addr_symbol(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* only check upper bounds since lower bounds will trigger carry */ + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + + return y + phys_base; +} +EXPORT_SYMBOL(__phys_addr_symbol); +#endif + +bool __virt_addr_valid(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) + return false; + } else { + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !phys_addr_valid(x)) + return false; + } + + return pfn_valid(x >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#else + +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + unsigned long phys_addr = x - PAGE_OFFSET; + /* VMALLOC_* aren't constants */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); + /* max_low_pfn is set early, but not _that_ early */ + if (max_low_pfn) { + VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); + BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); + } + return phys_addr; +} +EXPORT_SYMBOL(__phys_addr); +#endif + +bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) + return false; + if (x >= FIXADDR_START) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h new file mode 100644 index 000000000..9f6419caf --- /dev/null +++ b/arch/x86/mm/physaddr.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <asm/processor.h> + +static inline int phys_addr_valid(resource_size_t addr) +{ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + return !(addr >> boot_cpu_data.x86_phys_bits); +#else + return 1; +#endif +} diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c new file mode 100644 index 000000000..6e98e0a7c --- /dev/null +++ b/arch/x86/mm/pkeys.c @@ -0,0 +1,226 @@ +/* + * Intel Memory Protection Keys management + * Copyright (c) 2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/debugfs.h> /* debugfs_create_u32() */ +#include <linux/mm_types.h> /* mm_struct, vma, etc... */ +#include <linux/pkeys.h> /* PKEY_* */ +#include <uapi/asm-generic/mman-common.h> + +#include <asm/cpufeature.h> /* boot_cpu_has, ... */ +#include <asm/mmu_context.h> /* vma_pkey() */ + +int __execute_only_pkey(struct mm_struct *mm) +{ + bool need_to_set_mm_pkey = false; + int execute_only_pkey = mm->context.execute_only_pkey; + int ret; + + /* Do we need to assign a pkey for mm's execute-only maps? */ + if (execute_only_pkey == -1) { + /* Go allocate one to use, which might fail */ + execute_only_pkey = mm_pkey_alloc(mm); + if (execute_only_pkey < 0) + return -1; + need_to_set_mm_pkey = true; + } + + /* + * We do not want to go through the relatively costly + * dance to set PKRU if we do not need to. Check it + * first and assume that if the execute-only pkey is + * write-disabled that we do not have to set it + * ourselves. We need preempt off so that nobody + * can make fpregs inactive. + */ + preempt_disable(); + if (!need_to_set_mm_pkey && + current->thread.fpu.initialized && + !__pkru_allows_read(read_pkru(), execute_only_pkey)) { + preempt_enable(); + return execute_only_pkey; + } + preempt_enable(); + + /* + * Set up PKRU so that it denies access for everything + * other than execution. + */ + ret = arch_set_user_pkey_access(current, execute_only_pkey, + PKEY_DISABLE_ACCESS); + /* + * If the PKRU-set operation failed somehow, just return + * 0 and effectively disable execute-only support. + */ + if (ret) { + mm_set_pkey_free(mm, execute_only_pkey); + return -1; + } + + /* We got one, store it and use it from here on out */ + if (need_to_set_mm_pkey) + mm->context.execute_only_pkey = execute_only_pkey; + return execute_only_pkey; +} + +static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma) +{ + /* Do this check first since the vm_flags should be hot */ + if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC) + return false; + if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey) + return false; + + return true; +} + +/* + * This is only called for *plain* mprotect calls. + */ +int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) +{ + /* + * Is this an mprotect_pkey() call? If so, never + * override the value that came from the user. + */ + if (pkey != -1) + return pkey; + + /* + * The mapping is execute-only. Go try to get the + * execute-only protection key. If we fail to do that, + * fall through as if we do not have execute-only + * support in this mm. + */ + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(vma->vm_mm); + if (pkey > 0) + return pkey; + } else if (vma_is_pkey_exec_only(vma)) { + /* + * Protections are *not* PROT_EXEC, but the mapping + * is using the exec-only pkey. This mapping was + * PROT_EXEC and will no longer be. Move back to + * the default pkey. + */ + return ARCH_DEFAULT_PKEY; + } + + /* + * This is a vanilla, non-pkey mprotect (or we failed to + * setup execute-only), inherit the pkey from the VMA we + * are working on. + */ + return vma_pkey(vma); +} + +#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY)) + +/* + * Make the default PKRU value (at execve() time) as restrictive + * as possible. This ensures that any threads clone()'d early + * in the process's lifetime will not accidentally get access + * to data which is pkey-protected later on. + */ +u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | + PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | + PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | + PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | + PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); + +/* + * Called from the FPU code when creating a fresh set of FPU + * registers. This is called from a very specific context where + * we know the FPU regstiers are safe for use and we can use PKRU + * directly. + */ +void copy_init_pkru_to_fpregs(void) +{ + u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); + /* + * Any write to PKRU takes it out of the XSAVE 'init + * state' which increases context switch cost. Avoid + * writing 0 when PKRU was already 0. + */ + if (!init_pkru_value_snapshot && !read_pkru()) + return; + /* + * Override the PKRU state that came from 'init_fpstate' + * with the baseline from the process. + */ + write_pkru(init_pkru_value_snapshot); +} + +static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "0x%x\n", init_pkru_value); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t init_pkru_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + u32 new_init_pkru; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + /* Make the buffer a valid string that we can not overrun */ + buf[len] = '\0'; + if (kstrtouint(buf, 0, &new_init_pkru)) + return -EINVAL; + + /* + * Don't allow insane settings that will blow the system + * up immediately if someone attempts to disable access + * or writes to pkey 0. + */ + if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT)) + return -EINVAL; + + WRITE_ONCE(init_pkru_value, new_init_pkru); + return count; +} + +static const struct file_operations fops_init_pkru = { + .read = init_pkru_read_file, + .write = init_pkru_write_file, + .llseek = default_llseek, +}; + +static int __init create_init_pkru_value(void) +{ + debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_init_pkru); + return 0; +} +late_initcall(create_init_pkru_value); + +static __init int setup_init_pkru(char *opt) +{ + u32 new_init_pkru; + + if (kstrtouint(opt, 0, &new_init_pkru)) + return 1; + + WRITE_ONCE(init_pkru_value, new_init_pkru); + + return 1; +} +__setup("init_pkru=", setup_init_pkru); diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c new file mode 100644 index 000000000..622d5968c --- /dev/null +++ b/arch/x86/mm/pti.c @@ -0,0 +1,658 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This code is based in part on work published here: + * + * https://github.com/IAIK/KAISER + * + * The original work was written by and and signed off by for the Linux + * kernel by: + * + * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> + * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> + * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> + * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> + * + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com> + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and + * Andy Lutomirsky <luto@amacapital.net> + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/cpu.h> + +#include <asm/cpufeature.h> +#include <asm/hypervisor.h> +#include <asm/vsyscall.h> +#include <asm/cmdline.h> +#include <asm/pti.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/desc.h> +#include <asm/sections.h> + +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + +/* Backporting helper */ +#ifndef __GFP_NOTRACK +#define __GFP_NOTRACK 0 +#endif + +/* + * Define the page-table levels we clone for user-space on 32 + * and 64 bit. + */ +#ifdef CONFIG_X86_64 +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD +#else +#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE +#endif + +static void __init pti_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + pr_info("%s\n", reason); +} + +static void __init pti_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + pr_info("%s\n", reason); +} + +enum pti_mode { + PTI_AUTO = 0, + PTI_FORCE_OFF, + PTI_FORCE_ON +} pti_mode; + +void __init pti_check_boottime_disable(void) +{ + char arg[5]; + int ret; + + /* Assume mode is auto unless overridden. */ + pti_mode = PTI_AUTO; + + if (hypervisor_is_type(X86_HYPER_XEN_PV)) { + pti_mode = PTI_FORCE_OFF; + pti_print_if_insecure("disabled on XEN PV."); + return; + } + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (ret == 3 && !strncmp(arg, "off", 3)) { + pti_mode = PTI_FORCE_OFF; + pti_print_if_insecure("disabled on command line."); + return; + } + if (ret == 2 && !strncmp(arg, "on", 2)) { + pti_mode = PTI_FORCE_ON; + pti_print_if_secure("force enabled on command line."); + goto enable; + } + if (ret == 4 && !strncmp(arg, "auto", 4)) { + pti_mode = PTI_AUTO; + goto autosel; + } + } + + if (cmdline_find_option_bool(boot_command_line, "nopti") || + cpu_mitigations_off()) { + pti_mode = PTI_FORCE_OFF; + pti_print_if_insecure("disabled on command line."); + return; + } + +autosel: + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return; +enable: + setup_force_cpu_cap(X86_FEATURE_PTI); +} + +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Changes to the high (kernel) portion of the kernelmode page + * tables are not automatically propagated to the usermode tables. + * + * Users should keep in mind that, unlike the kernelmode tables, + * there is no vmalloc_fault equivalent for the usermode tables. + * Top-level entries added to init_mm's usermode pgd after boot + * will not be automatically propagated to other mms. + */ + if (!pgdp_maps_userspace(pgdp)) + return pgd; + + /* + * The user page tables get the full PGD, accessible from + * userspace: + */ + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd; + + /* + * If this is normal user memory, make it NX in the kernel + * pagetables so that, if we somehow screw up and return to + * usermode with the kernel CR3 loaded, we'll get a page fault + * instead of allowing user code to execute with the wrong CR3. + * + * As exceptions, we don't set NX if: + * - _PAGE_USER is not set. This could be an executable + * EFI runtime mapping or something similar, and the kernel + * may execute from it + * - we don't have NX support + * - we're clearing the PGD (i.e. the new pgd is not present). + */ + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) && + (__supported_pte_mask & _PAGE_NX)) + pgd.pgd |= _PAGE_NX; + + /* return the copy of the PGD we want the kernel to use: */ + return pgd; +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a P4D on success, or NULL on failure. + */ +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address) +{ + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + + if (address < PAGE_OFFSET) { + WARN_ONCE(1, "attempt to walk user address\n"); + return NULL; + } + + if (pgd_none(*pgd)) { + unsigned long new_p4d_page = __get_free_page(gfp); + if (WARN_ON_ONCE(!new_p4d_page)) + return NULL; + + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page))); + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); + + return p4d_offset(pgd, address); +} + +/* + * Walk the user copy of the page tables (optionally) trying to allocate + * page table pages on the way down. + * + * Returns a pointer to a PMD on success, or NULL on failure. + */ +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + p4d_t *p4d; + pud_t *pud; + + p4d = pti_user_pagetable_walk_p4d(address); + if (!p4d) + return NULL; + + BUILD_BUG_ON(p4d_large(*p4d) != 0); + if (p4d_none(*p4d)) { + unsigned long new_pud_page = __get_free_page(gfp); + if (WARN_ON_ONCE(!new_pud_page)) + return NULL; + + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page))); + } + + pud = pud_offset(p4d, address); + /* The user page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (WARN_ON_ONCE(!new_pmd_page)) + return NULL; + + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + } + + return pmd_offset(pud, address); +} + +/* + * Walk the shadow copy of the page tables (optionally) trying to allocate + * page table pages on the way down. Does not support large pages. + * + * Note: this is only used when mapping *new* kernel data into the + * user/shadow page tables. It is never used for userspace data. + * + * Returns a pointer to a PTE on success, or NULL on failure. + */ +static pte_t *pti_user_pagetable_walk_pte(unsigned long address) +{ + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + pmd_t *pmd; + pte_t *pte; + + pmd = pti_user_pagetable_walk_pmd(address); + if (!pmd) + return NULL; + + /* We can't do anything sensible if we hit a large mapping. */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + } + + pte = pte_offset_kernel(pmd, address); + if (pte_flags(*pte) & _PAGE_USER) { + WARN_ONCE(1, "attempt to walk to user pte\n"); + return NULL; + } + return pte; +} + +#ifdef CONFIG_X86_VSYSCALL_EMULATION +static void __init pti_setup_vsyscall(void) +{ + pte_t *pte, *target_pte; + unsigned int level; + + pte = lookup_address(VSYSCALL_ADDR, &level); + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte)) + return; + + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR); + if (WARN_ON(!target_pte)) + return; + + *target_pte = *pte; + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir)); +} +#else +static void __init pti_setup_vsyscall(void) { } +#endif + +enum pti_clone_level { + PTI_CLONE_PMD, + PTI_CLONE_PTE, +}; + +static void +pti_clone_pgtable(unsigned long start, unsigned long end, + enum pti_clone_level level) +{ + unsigned long addr; + + /* + * Clone the populated PMDs which cover start to end. These PMD areas + * can have holes. + */ + for (addr = start; addr < end;) { + pte_t *pte, *target_pte; + pmd_t *pmd, *target_pmd; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + /* Overflow check */ + if (addr < start) + break; + + pgd = pgd_offset_k(addr); + if (WARN_ON(pgd_none(*pgd))) + return; + p4d = p4d_offset(pgd, addr); + if (WARN_ON(p4d_none(*p4d))) + return; + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + WARN_ON_ONCE(addr & ~PUD_MASK); + addr = round_up(addr + 1, PUD_SIZE); + continue; + } + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + WARN_ON_ONCE(addr & ~PMD_MASK); + addr = round_up(addr + 1, PMD_SIZE); + continue; + } + + if (pmd_large(*pmd) || level == PTI_CLONE_PMD) { + target_pmd = pti_user_pagetable_walk_pmd(addr); + if (WARN_ON(!target_pmd)) + return; + + /* + * Only clone present PMDs. This ensures only setting + * _PAGE_GLOBAL on present PMDs. This should only be + * called on well-known addresses anyway, so a non- + * present PMD would be a surprise. + */ + if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT))) + return; + + /* + * Setting 'target_pmd' below creates a mapping in both + * the user and kernel page tables. It is effectively + * global, so set it as global in both copies. Note: + * the X86_FEATURE_PGE check is not _required_ because + * the CPU ignores _PAGE_GLOBAL when PGE is not + * supported. The check keeps consistentency with + * code that only set this bit when supported. + */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL); + + /* + * Copy the PMD. That is, the kernelmode and usermode + * tables will share the last-level page tables of this + * address range + */ + *target_pmd = *pmd; + + addr += PMD_SIZE; + + } else if (level == PTI_CLONE_PTE) { + + /* Walk the page-table down to the pte level */ + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + addr += PAGE_SIZE; + continue; + } + + /* Only clone present PTEs */ + if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT))) + return; + + /* Allocate PTE in the user page-table */ + target_pte = pti_user_pagetable_walk_pte(addr); + if (WARN_ON(!target_pte)) + return; + + /* Set GLOBAL bit in both PTEs */ + if (boot_cpu_has(X86_FEATURE_PGE)) + *pte = pte_set_flags(*pte, _PAGE_GLOBAL); + + /* Clone the PTE */ + *target_pte = *pte; + + addr += PAGE_SIZE; + + } else { + BUG(); + } + } +} + +#ifdef CONFIG_X86_64 +/* + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a + * next-level entry on 5-level systems. + */ +static void __init pti_clone_p4d(unsigned long addr) +{ + p4d_t *kernel_p4d, *user_p4d; + pgd_t *kernel_pgd; + + user_p4d = pti_user_pagetable_walk_p4d(addr); + if (!user_p4d) + return; + + kernel_pgd = pgd_offset_k(addr); + kernel_p4d = p4d_offset(kernel_pgd, addr); + *user_p4d = *kernel_p4d; +} + +/* + * Clone the CPU_ENTRY_AREA into the user space visible page table. + */ +static void __init pti_clone_user_shared(void) +{ + pti_clone_p4d(CPU_ENTRY_AREA_BASE); +} + +#else /* CONFIG_X86_64 */ + +/* + * On 32 bit PAE systems with 1GB of Kernel address space there is only + * one pgd/p4d for the whole kernel. Cloning that would map the whole + * address space into the user page-tables, making PTI useless. So clone + * the page-table on the PMD level to prevent that. + */ +static void __init pti_clone_user_shared(void) +{ + unsigned long start, end; + + start = CPU_ENTRY_AREA_BASE; + end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES); + + pti_clone_pgtable(start, end, PTI_CLONE_PMD); +} +#endif /* CONFIG_X86_64 */ + +/* + * Clone the ESPFIX P4D into the user space visible page table + */ +static void __init pti_setup_espfix64(void) +{ +#ifdef CONFIG_X86_ESPFIX64 + pti_clone_p4d(ESPFIX_BASE_ADDR); +#endif +} + +/* + * Clone the populated PMDs of the entry and irqentry text and force it RO. + */ +static void pti_clone_entry_text(void) +{ + pti_clone_pgtable((unsigned long) __entry_text_start, + (unsigned long) __irqentry_text_end, + PTI_CLONE_PMD); +} + +/* + * Global pages and PCIDs are both ways to make kernel TLB entries + * live longer, reduce TLB misses and improve kernel performance. + * But, leaving all kernel text Global makes it potentially accessible + * to Meltdown-style attacks which make it trivial to find gadgets or + * defeat KASLR. + * + * Only use global pages when it is really worth it. + */ +static inline bool pti_kernel_image_global_ok(void) +{ + /* + * Systems with PCIDs get litlle benefit from global + * kernel text and are not worth the downsides. + */ + if (cpu_feature_enabled(X86_FEATURE_PCID)) + return false; + + /* + * Only do global kernel image for pti=auto. Do the most + * secure thing (not global) if pti=on specified. + */ + if (pti_mode != PTI_AUTO) + return false; + + /* + * K8 may not tolerate the cleared _PAGE_RW on the userspace + * global kernel image pages. Do the safe thing (disable + * global kernel image). This is unlikely to ever be + * noticed because PTI is disabled by default on AMD CPUs. + */ + if (boot_cpu_has(X86_FEATURE_K8)) + return false; + + /* + * RANDSTRUCT derives its hardening benefits from the + * attacker's lack of knowledge about the layout of kernel + * data structures. Keep the kernel image non-global in + * cases where RANDSTRUCT is in use to help keep the layout a + * secret. + */ + if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT)) + return false; + + return true; +} + +/* + * This is the only user for these and it is not arch-generic + * like the other set_memory.h functions. Just extern them. + */ +extern int set_memory_nonglobal(unsigned long addr, int numpages); +extern int set_memory_global(unsigned long addr, int numpages); + +/* + * For some configurations, map all of kernel text into the user page + * tables. This reduces TLB misses, especially on non-PCID systems. + */ +static void pti_clone_kernel_text(void) +{ + /* + * rodata is part of the kernel image and is normally + * readable on the filesystem or on the web. But, do not + * clone the areas past rodata, they might contain secrets. + */ + unsigned long start = PFN_ALIGN(_text); + unsigned long end_clone = (unsigned long)__end_rodata_aligned; + unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table); + + if (!pti_kernel_image_global_ok()) + return; + + pr_debug("mapping partial kernel image into user address space\n"); + + /* + * Note that this will undo _some_ of the work that + * pti_set_kernel_image_nonglobal() did to clear the + * global bit. + */ + pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE); + + /* + * pti_clone_pgtable() will set the global bit in any PMDs + * that it clones, but we also need to get any PTEs in + * the last level for areas that are not huge-page-aligned. + */ + + /* Set the global bit for normal non-__init kernel text: */ + set_memory_global(start, (end_global - start) >> PAGE_SHIFT); +} + +void pti_set_kernel_image_nonglobal(void) +{ + /* + * The identity map is created with PMDs, regardless of the + * actual length of the kernel. We need to clear + * _PAGE_GLOBAL up to a PMD boundary, not just to the end + * of the image. + */ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE); + + /* + * This clears _PAGE_GLOBAL from the entire kernel image. + * pti_clone_kernel_text() map put _PAGE_GLOBAL back for + * areas that are mapped to userspace. + */ + set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT); +} + +/* + * Initialize kernel page table isolation + */ +void __init pti_init(void) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + pr_info("enabled\n"); + +#ifdef CONFIG_X86_32 + /* + * We check for X86_FEATURE_PCID here. But the init-code will + * clear the feature flag on 32 bit because the feature is not + * supported on 32 bit anyway. To print the warning we need to + * check with cpuid directly again. + */ + if (cpuid_ecx(0x1) & BIT(17)) { + /* Use printk to work around pr_fmt() */ + printk(KERN_WARNING "\n"); + printk(KERN_WARNING "************************************************************\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); + printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); + printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "************************************************************\n"); + } +#endif + + pti_clone_user_shared(); + + /* Undo all global bits from the init pagetables in head_64.S: */ + pti_set_kernel_image_nonglobal(); + /* Replace some of the global bits just for shared entry text: */ + pti_clone_entry_text(); + pti_setup_espfix64(); + pti_setup_vsyscall(); +} + +/* + * Finalize the kernel mappings in the userspace page-table. Some of the + * mappings for the kernel image might have changed since pti_init() + * cloned them. This is because parts of the kernel image have been + * mapped RO and/or NX. These changes need to be cloned again to the + * userspace page-table. + */ +void pti_finalize(void) +{ + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; + /* + * We need to clone everything (again) that maps parts of the + * kernel image. + */ + pti_clone_entry_text(); + pti_clone_kernel_text(); + + debug_checkwx_user(); +} diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c new file mode 100644 index 000000000..adb3c5784 --- /dev/null +++ b/arch/x86/mm/setup_nx.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/init.h> + +#include <asm/pgtable.h> +#include <asm/proto.h> +#include <asm/cpufeature.h> + +static int disable_nx; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + } + x86_configure_nx(); + return 0; +} +early_param("noexec", noexec_setup); + +void x86_configure_nx(void) +{ + if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +void __init x86_report_nx(void) +{ + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + if (disable_nx) { + printk(KERN_INFO "NX (Execute Disable) protection: " + "disabled by kernel command line option\n"); + } else { + printk(KERN_INFO "NX (Execute Disable) protection: " + "active\n"); + } +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c new file mode 100644 index 000000000..dac07e4f5 --- /dev/null +++ b/arch/x86/mm/srat.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/mmzone.h> +#include <linux/bitmap.h> +#include <linux/init.h> +#include <linux/topology.h> +#include <linux/mm.h> +#include <asm/proto.h> +#include <asm/numa.h> +#include <asm/e820/api.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> + +/* Callback for Proximity Domain -> x2APIC mapping */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + apic_id = pa->apic_id; + if (!apic->apic_id_valid(apic_id)) { + printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", + pxm, apic_id); + return; + } + node = acpi_map_pxm_to_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", + pxm, apic_id, node); +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain_lo; + if (acpi_srat_revision >= 2) + pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; + node = acpi_map_pxm_to_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (get_uv_system_type() >= UV_X2APIC) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", + pxm, apic_id, node); +} + +int __init x86_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? -EINVAL : 0; +} diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c new file mode 100644 index 000000000..f6ae6830b --- /dev/null +++ b/arch/x86/mm/testmmiotrace.c @@ -0,0 +1,140 @@ +/* + * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/io.h> +#include <linux/mmiotrace.h> + +static unsigned long mmio_address; +module_param_hw(mmio_address, ulong, iomem, 0); +MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " + "(or 8 MB if read_far is non-zero)."); + +static unsigned long read_far = 0x400100; +module_param(read_far, ulong, 0); +MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " + "(default: 0x400100)."); + +static unsigned v16(unsigned i) +{ + return i * 12 + 7; +} + +static unsigned v32(unsigned i) +{ + return i * 212371 + 13; +} + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + pr_info("write test.\n"); + mmiotrace_printk("Write test.\n"); + + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(v16(i), p + i); + + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(v32(i), p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + unsigned errs[3] = { 0 }; + pr_info("read test.\n"); + mmiotrace_printk("Read test.\n"); + + for (i = 0; i < 256; i++) + if (ioread8(p + i) != i) + ++errs[0]; + + for (i = 1024; i < (5 * 1024); i += 2) + if (ioread16(p + i) != v16(i)) + ++errs[1]; + + for (i = (5 * 1024); i < (16 * 1024); i += 4) + if (ioread32(p + i) != v32(i)) + ++errs[2]; + + mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", + errs[0], errs[1], errs[2]); +} + +static void do_read_far_test(void __iomem *p) +{ + pr_info("read far test.\n"); + mmiotrace_printk("Read far test.\n"); + + ioread32(p + read_far); +} + +static void do_test(unsigned long size) +{ + void __iomem *p = ioremap_nocache(mmio_address, size); + if (!p) { + pr_err("could not ioremap, aborting.\n"); + return; + } + mmiotrace_printk("ioremap returned %p.\n", p); + do_write_test(p); + do_read_test(p); + if (read_far && read_far < size - 4) + do_read_far_test(p); + iounmap(p); +} + +/* + * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in + * a short time. We had a bug in deferred freeing procedure which tried + * to free this region multiple times (ioremap can reuse the same address + * for many mappings). + */ +static void do_test_bulk_ioremapping(void) +{ + void __iomem *p; + int i; + + for (i = 0; i < 10; ++i) { + p = ioremap_nocache(mmio_address, PAGE_SIZE); + if (p) + iounmap(p); + } + + /* Force freeing. If it will crash we will know why. */ + synchronize_rcu(); +} + +static int __init init(void) +{ + unsigned long size = (read_far) ? (8 << 20) : (16 << 10); + + if (mmio_address == 0) { + pr_err("you have to use the module argument mmio_address.\n"); + pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); + do_test(size); + do_test_bulk_ioremapping(); + pr_info("All done.\n"); + return 0; +} + +static void __exit cleanup(void) +{ + pr_debug("unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c new file mode 100644 index 000000000..2f41a34c8 --- /dev/null +++ b/arch/x86/mm/tlb.c @@ -0,0 +1,840 @@ +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/export.h> +#include <linux/cpu.h> +#include <linux/debugfs.h> + +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/nospec-branch.h> +#include <asm/cache.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> + +/* + * TLB flushing, formerly SMP-only + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + * + * More scalable flush, from Andi Kleen + * + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi + */ + +/* + * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_ibpb. + */ +#define LAST_USER_MM_IBPB 0x1UL + +/* + * We get here when we do something requiring a TLB invalidation + * but could not go invalidate all of the contexts. We do the + * necessary invalidation by clearing out the 'ctx_id' which + * forces a TLB flush when the context is loaded. + */ +static void clear_asid_other(void) +{ + u16 asid; + + /* + * This is only expected to be set if we have disabled + * kernel _PAGE_GLOBAL pages. + */ + if (!static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON_ONCE(1); + return; + } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + /* Do not need to flush the current asid */ + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid)) + continue; + /* + * Make sure the next time we go to switch to + * this asid, we do a flush: + */ + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0); + } + this_cpu_write(cpu_tlbstate.invalidate_other, false); +} + +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + + +static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) +{ + u16 asid; + + if (!static_cpu_has(X86_FEATURE_PCID)) { + *new_asid = 0; + *need_flush = true; + return; + } + + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) + continue; + + *new_asid = asid; + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < + next_tlb_gen); + return; + } + + /* + * We don't currently own an ASID slot on this CPU. + * Allocate a slot. + */ + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (*new_asid >= TLB_NR_DYN_ASIDS) { + *new_asid = 0; + this_cpu_write(cpu_tlbstate.next_asid, 1); + } + *need_flush = true; +} + +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) +{ + unsigned long new_mm_cr3; + + if (need_flush) { + invalidate_user_asid(new_asid); + new_mm_cr3 = build_cr3(pgdir, new_asid); + } else { + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid); + } + + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + +void leave_mm(int cpu) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + + /* + * It's plausible that we're in lazy TLB mode while our mm is init_mm. + * If so, our callers still expect us to flush the TLB, but there + * aren't any user TLB entries in init_mm to worry about. + * + * This needs to happen before any other sanity checks due to + * intel_idle's shenanigans. + */ + if (loaded_mm == &init_mm) + return; + + /* Warn if we're not lazy. */ + WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); + + switch_mm(NULL, &init_mm, NULL); +} +EXPORT_SYMBOL_GPL(leave_mm); + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +static void sync_current_stack_to_mm(struct mm_struct *mm) +{ + unsigned long sp = current_stack_pointer; + pgd_t *pgd = pgd_offset(mm, sp); + + if (pgtable_l5_enabled()) { + if (unlikely(pgd_none(*pgd))) { + pgd_t *pgd_ref = pgd_offset_k(sp); + + set_pgd(pgd, *pgd_ref); + } + } else { + /* + * "pgd" is faked. The top level entries are "p4d"s, so sync + * the p4d. This compiles to approximately the same code as + * the 5-level case. + */ + p4d_t *p4d = p4d_offset(pgd, sp); + + if (unlikely(p4d_none(*p4d))) { + pgd_t *pgd_ref = pgd_offset_k(sp); + p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); + + set_p4d(p4d, *p4d_ref); + } + } +} + +static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +{ + unsigned long next_tif = task_thread_info(next)->flags; + unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + + return (unsigned long)next->mm | ibpb; +} + +static void cond_ibpb(struct task_struct *next) +{ + if (!next || !next->mm) + return; + + /* + * Both, the conditional and the always IBPB mode use the mm + * pointer to avoid the IBPB when switching between tasks of the + * same process. Using the mm pointer instead of mm->context.ctx_id + * opens a hypothetical hole vs. mm_struct reuse, which is more or + * less impossible to control by an attacker. Aside of that it + * would only affect the first schedule so the theoretically + * exposed data is not really interesting. + */ + if (static_branch_likely(&switch_mm_cond_ibpb)) { + unsigned long prev_mm, next_mm; + + /* + * This is a bit more complex than the always mode because + * it has to handle two cases: + * + * 1) Switch from a user space task (potential attacker) + * which has TIF_SPEC_IB set to a user space task + * (potential victim) which has TIF_SPEC_IB not set. + * + * 2) Switch from a user space task (potential attacker) + * which has TIF_SPEC_IB not set to a user space task + * (potential victim) which has TIF_SPEC_IB set. + * + * This could be done by unconditionally issuing IBPB when + * a task which has TIF_SPEC_IB set is either scheduled in + * or out. Though that results in two flushes when: + * + * - the same user space task is scheduled out and later + * scheduled in again and only a kernel thread ran in + * between. + * + * - a user space task belonging to the same process is + * scheduled in after a kernel thread ran in between + * + * - a user space task belonging to the same process is + * scheduled in immediately. + * + * Optimize this with reasonably small overhead for the + * above cases. Mangle the TIF_SPEC_IB bit into the mm + * pointer of the incoming task which is stored in + * cpu_tlbstate.last_user_mm_ibpb for comparison. + */ + next_mm = mm_mangle_tif_spec_ib(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); + + /* + * Issue IBPB only if the mm's are different and one or + * both have the IBPB bit set. + */ + if (next_mm != prev_mm && + (next_mm | prev_mm) & LAST_USER_MM_IBPB) + indirect_branch_prediction_barrier(); + + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); + } + + if (static_branch_unlikely(&switch_mm_always_ibpb)) { + /* + * Only flush when switching to a user space task with a + * different context than the user space task which ran + * last on this CPU. + */ + if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + indirect_branch_prediction_barrier(); + this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); + } + } +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; + + /* + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. + * + * NB: leave_mm() calls us with prev == NULL and tsk == NULL. + */ + + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off() or that + * does something like write_cr3(read_cr3_pa()). + * + * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() + * isn't free. + */ +#ifdef CONFIG_DEBUG_VM + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { + /* + * If we were to BUG here, we'd be very likely to kill + * the system so hard that we don't see the call trace. + * Try to recover instead by ignoring the error and doing + * a global flush to minimize the chance of corruption. + * + * (This is far from being a fully correct recovery. + * Architecturally, the CPU could prefetch something + * back into an incorrect ASID slot and leave it there + * to cause trouble down the road. It's better than + * nothing, though.) + */ + __flush_tlb_all(); + } +#endif + this_cpu_write(cpu_tlbstate.is_lazy, false); + + /* + * The membarrier system call requires a full memory barrier and + * core serialization before returning to user-space, after + * storing to rq->curr, when changing mm. This is because + * membarrier() sends IPIs to all CPUs that are in the target mm + * to make them issue memory barriers. However, if another CPU + * switches to/from the target mm concurrently with + * membarrier(), it can cause that CPU not to receive an IPI + * when it really should issue a memory barrier. Writing to CR3 + * provides that full memory barrier and core serializing + * instruction. + */ + if (real_prev == next) { + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + /* + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. + */ + if (WARN_ON_ONCE(real_prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + + return; + } else { + u16 new_asid; + bool need_flush; + + /* + * Avoid user/user BTB poisoning by flushing the branch + * predictor when switching between processes. This stops + * one process from doing Spectre-v2 attacks on another. + */ + cond_ibpb(tsk); + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ + sync_current_stack_to_mm(next); + } + + /* + * Stop remote flushes for the previous mm. + * Skip kernel threads; we never send init_mm TLB flushing IPIs, + * but the bitmap manipulation can cause cache line contention. + */ + if (real_prev != &init_mm) { + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, + mm_cpumask(real_prev))); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + } + + /* + * Start remote flushes and then read tlb_gen. + */ + if (next != &init_mm) + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); + + /* Let nmi_uaccess_okay() know that we're changing CR3. */ + this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); + barrier(); + + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); + + /* + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); + + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + } + + /* Make sure we write CR3 before loaded_mm. */ + barrier(); + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); + } + + load_mm_cr4(next); + switch_ldt(real_prev, next); +} + +/* + * Please ignore the name of this function. It should be called + * switch_to_kernel_thread(). + * + * enter_lazy_tlb() is a hint from the scheduler that we are entering a + * kernel thread or other context without an mm. Acceptable implementations + * include doing nothing whatsoever, switching to init_mm, or various clever + * lazy tricks to try to minimize TLB flushes. + * + * The scheduler reserves the right to call enter_lazy_tlb() several times + * in a row. It will notify us that we're going back to a real mm by + * calling switch_mm_irqs_off(). + */ +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) + return; + + if (tlb_defer_switch_to_init_mm()) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } +} + +/* + * Call this when reinitializing a CPU. It fixes the following potential + * problems: + * + * - The ASID changed from what cpu_tlbstate thinks it is (most likely + * because the CPU was taken down and came back up with CR3's PCID + * bits clear. CPU hotplug can do this. + * + * - The TLB contains junk in slots corresponding to inactive ASIDs. + * + * - The CPU went so far out to lunch that it may have missed a TLB + * flush. + */ +void initialize_tlbstate_and_flush(void) +{ + int i; + struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long cr3 = __read_cr3(); + + /* Assert that CR3 already references the right mm. */ + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + + /* + * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization + * doesn't work like other CR4 bits because it can only be set from + * long mode.) + */ + WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && + !(cr4_read_shadow() & X86_CR4_PCIDE)); + + /* Force ASID 0 and force a TLB flush. */ + write_cr3(build_cr3(mm->pgd, 0)); + + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + + for (i = 1; i < TLB_NR_DYN_ASIDS; i++) + this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); +} + +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ +static void flush_tlb_func_common(const struct flush_tlb_info *f, + bool local, enum tlb_flush_reason reason) +{ + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + + /* This code cannot presently handle being reentered. */ + VM_WARN_ON(!irqs_disabled()); + + if (unlikely(loaded_mm == &init_mm)) + return; + + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (this_cpu_read(cpu_tlbstate.is_lazy)) { + /* + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. + */ + switch_mm_irqs_off(NULL, &init_mm, NULL); + return; + } + + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_one_user() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ + unsigned long addr; + unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + + addr = f->start; + while (addr < f->end) { + __flush_tlb_one_user(addr); + addr += PAGE_SIZE; + } + if (local) + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); + trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); + } + + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); +} + +static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) +{ + const struct flush_tlb_info *f = info; + + flush_tlb_func_common(f, true, reason); +} + +static void flush_tlb_func_remote(void *info) +{ + const struct flush_tlb_info *f = info; + + inc_irq_stat(irq_tlb_count); + + if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) + return; + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + if (info->end == TLB_FLUSH_ALL) + trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); + else + trace_tlb_flush(TLB_REMOTE_SEND_IPI, + (info->end - info->start) >> PAGE_SHIFT); + + if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ + cpumask = uv_flush_tlb_others(cpumask, info); + if (cpumask) + smp_call_function_many(cpumask, flush_tlb_func_remote, + (void *)info, 1); + return; + } + smp_call_function_many(cpumask, flush_tlb_func_remote, + (void *)info, 1); +} + +/* + * See Documentation/x86/tlb.txt for details. We choose 33 + * because it is large enough to cover the vast majority (at + * least 95%) of allocations, and is small enough that we are + * confident it will not cause too much overhead. Each single + * flush is about 100 ns, so this caps the maximum overhead at + * _about_ 3,000 ns. + * + * This is in units of pages. + */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; + +void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long vmflag) +{ + int cpu; + + struct flush_tlb_info info = { + .mm = mm, + }; + + cpu = get_cpu(); + + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); + + /* Should we flush just the requested range? */ + if ((end != TLB_FLUSH_ALL) && + !(vmflag & VM_HUGETLB) && + ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { + info.start = start; + info.end = end; + } else { + info.start = 0UL; + info.end = TLB_FLUSH_ALL; + } + + if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + VM_WARN_ON(irqs_disabled()); + local_irq_disable(); + flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); + local_irq_enable(); + } + + if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), &info); + + put_cpu(); +} + + +static void do_flush_tlb_all(void *info) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + __flush_tlb_all(); +} + +void flush_tlb_all(void) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + on_each_cpu(do_flush_tlb_all, NULL, 1); +} + +static void do_kernel_range_flush(void *info) +{ + struct flush_tlb_info *f = info; + unsigned long addr; + + /* flush range by one by one 'invlpg' */ + for (addr = f->start; addr < f->end; addr += PAGE_SIZE) + __flush_tlb_one_kernel(addr); +} + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + + /* Balance as user space task's flush, a bit conservative */ + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { + on_each_cpu(do_flush_tlb_all, NULL, 1); + } else { + struct flush_tlb_info info; + info.start = start; + info.end = end; + on_each_cpu(do_kernel_range_flush, &info, 1); + } +} + +void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + struct flush_tlb_info info = { + .mm = NULL, + .start = 0UL, + .end = TLB_FLUSH_ALL, + }; + + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, &batch->cpumask)) { + VM_WARN_ON(irqs_disabled()); + local_irq_disable(); + flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); + local_irq_enable(); + } + + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) + flush_tlb_others(&batch->cpumask, &info); + + cpumask_clear(&batch->cpumask); + + put_cpu(); +} + +static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t tlbflush_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + int ceiling; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + buf[len] = '\0'; + if (kstrtoint(buf, 0, &ceiling)) + return -EINVAL; + + if (ceiling < 0) + return -EINVAL; + + tlb_single_page_flush_ceiling = ceiling; + return count; +} + +static const struct file_operations fops_tlbflush = { + .read = tlbflush_read_file, + .write = tlbflush_write_file, + .llseek = default_llseek, +}; + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlbflush); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); |