diff options
Diffstat (limited to 'arch/riscv/mm')
-rw-r--r-- | arch/riscv/mm/Makefile | 39 | ||||
-rw-r--r-- | arch/riscv/mm/cache-ops.c | 17 | ||||
-rw-r--r-- | arch/riscv/mm/cacheflush.c | 141 | ||||
-rw-r--r-- | arch/riscv/mm/context.c | 336 | ||||
-rw-r--r-- | arch/riscv/mm/dma-noncoherent.c | 158 | ||||
-rw-r--r-- | arch/riscv/mm/extable.c | 71 | ||||
-rw-r--r-- | arch/riscv/mm/fault.c | 377 | ||||
-rw-r--r-- | arch/riscv/mm/hugetlbpage.c | 375 | ||||
-rw-r--r-- | arch/riscv/mm/init.c | 1566 | ||||
-rw-r--r-- | arch/riscv/mm/kasan_init.c | 525 | ||||
-rw-r--r-- | arch/riscv/mm/pageattr.c | 439 | ||||
-rw-r--r-- | arch/riscv/mm/pgtable.c | 103 | ||||
-rw-r--r-- | arch/riscv/mm/physaddr.c | 51 | ||||
-rw-r--r-- | arch/riscv/mm/pmem.c | 34 | ||||
-rw-r--r-- | arch/riscv/mm/ptdump.c | 405 | ||||
-rw-r--r-- | arch/riscv/mm/tlbflush.c | 151 |
16 files changed, 4788 insertions, 0 deletions
diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile new file mode 100644 index 0000000000..3a4dfc8bab --- /dev/null +++ b/arch/riscv/mm/Makefile @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0-only + +CFLAGS_init.o := -mcmodel=medany +ifdef CONFIG_RELOCATABLE +CFLAGS_init.o += -fno-pie +endif + +ifdef CONFIG_FTRACE +CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_cacheflush.o = $(CC_FLAGS_FTRACE) +endif + +KCOV_INSTRUMENT_init.o := n + +obj-y += init.o +obj-$(CONFIG_MMU) += extable.o fault.o pageattr.o +obj-y += cacheflush.o +obj-y += context.o +obj-y += pgtable.o +obj-y += pmem.o + +ifeq ($(CONFIG_MMU),y) +obj-$(CONFIG_SMP) += tlbflush.o +endif +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_KASAN) += kasan_init.o + +ifdef CONFIG_KASAN +KASAN_SANITIZE_kasan_init.o := n +KASAN_SANITIZE_init.o := n +ifdef CONFIG_DEBUG_VIRTUAL +KASAN_SANITIZE_physaddr.o := n +endif +endif + +obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o +obj-$(CONFIG_RISCV_DMA_NONCOHERENT) += dma-noncoherent.o +obj-$(CONFIG_RISCV_NONSTANDARD_CACHE_OPS) += cache-ops.o diff --git a/arch/riscv/mm/cache-ops.c b/arch/riscv/mm/cache-ops.c new file mode 100644 index 0000000000..a993ad11d0 --- /dev/null +++ b/arch/riscv/mm/cache-ops.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021 Western Digital Corporation or its affiliates. + */ + +#include <asm/dma-noncoherent.h> + +struct riscv_nonstd_cache_ops noncoherent_cache_ops __ro_after_init; + +void +riscv_noncoherent_register_cache_ops(const struct riscv_nonstd_cache_ops *ops) +{ + if (!ops) + return; + noncoherent_cache_ops = *ops; +} +EXPORT_SYMBOL_GPL(riscv_noncoherent_register_cache_ops); diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c new file mode 100644 index 0000000000..f1387272a5 --- /dev/null +++ b/arch/riscv/mm/cacheflush.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2017 SiFive + */ + +#include <linux/of.h> +#include <asm/cacheflush.h> + +#ifdef CONFIG_SMP + +#include <asm/sbi.h> + +static void ipi_remote_fence_i(void *info) +{ + return local_flush_icache_all(); +} + +void flush_icache_all(void) +{ + local_flush_icache_all(); + + if (IS_ENABLED(CONFIG_RISCV_SBI) && !riscv_use_ipi_for_rfence()) + sbi_remote_fence_i(NULL); + else + on_each_cpu(ipi_remote_fence_i, NULL, 1); +} +EXPORT_SYMBOL(flush_icache_all); + +/* + * Performs an icache flush for the given MM context. RISC-V has no direct + * mechanism for instruction cache shoot downs, so instead we send an IPI that + * informs the remote harts they need to flush their local instruction caches. + * To avoid pathologically slow behavior in a common case (a bunch of + * single-hart processes on a many-hart machine, ie 'make -j') we avoid the + * IPIs for harts that are not currently executing a MM context and instead + * schedule a deferred local instruction cache flush to be performed before + * execution resumes on each hart. + */ +void flush_icache_mm(struct mm_struct *mm, bool local) +{ + unsigned int cpu; + cpumask_t others, *mask; + + preempt_disable(); + + /* Mark every hart's icache as needing a flush for this MM. */ + mask = &mm->context.icache_stale_mask; + cpumask_setall(mask); + /* Flush this hart's I$ now, and mark it as flushed. */ + cpu = smp_processor_id(); + cpumask_clear_cpu(cpu, mask); + local_flush_icache_all(); + + /* + * Flush the I$ of other harts concurrently executing, and mark them as + * flushed. + */ + cpumask_andnot(&others, mm_cpumask(mm), cpumask_of(cpu)); + local |= cpumask_empty(&others); + if (mm == current->active_mm && local) { + /* + * It's assumed that at least one strongly ordered operation is + * performed on this hart between setting a hart's cpumask bit + * and scheduling this MM context on that hart. Sending an SBI + * remote message will do this, but in the case where no + * messages are sent we still need to order this hart's writes + * with flush_icache_deferred(). + */ + smp_mb(); + } else if (IS_ENABLED(CONFIG_RISCV_SBI) && + !riscv_use_ipi_for_rfence()) { + sbi_remote_fence_i(&others); + } else { + on_each_cpu_mask(&others, ipi_remote_fence_i, NULL, 1); + } + + preempt_enable(); +} + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_MMU +void flush_icache_pte(pte_t pte) +{ + struct folio *folio = page_folio(pte_page(pte)); + + if (!test_bit(PG_dcache_clean, &folio->flags)) { + flush_icache_all(); + set_bit(PG_dcache_clean, &folio->flags); + } +} +#endif /* CONFIG_MMU */ + +unsigned int riscv_cbom_block_size; +EXPORT_SYMBOL_GPL(riscv_cbom_block_size); + +unsigned int riscv_cboz_block_size; +EXPORT_SYMBOL_GPL(riscv_cboz_block_size); + +static void __init cbo_get_block_size(struct device_node *node, + const char *name, u32 *block_size, + unsigned long *first_hartid) +{ + unsigned long hartid; + u32 val; + + if (riscv_of_processor_hartid(node, &hartid)) + return; + + if (of_property_read_u32(node, name, &val)) + return; + + if (!*block_size) { + *block_size = val; + *first_hartid = hartid; + } else if (*block_size != val) { + pr_warn("%s mismatched between harts %lu and %lu\n", + name, *first_hartid, hartid); + } +} + +void __init riscv_init_cbo_blocksizes(void) +{ + unsigned long cbom_hartid, cboz_hartid; + u32 cbom_block_size = 0, cboz_block_size = 0; + struct device_node *node; + + for_each_of_cpu_node(node) { + /* set block-size for cbom and/or cboz extension if available */ + cbo_get_block_size(node, "riscv,cbom-block-size", + &cbom_block_size, &cbom_hartid); + cbo_get_block_size(node, "riscv,cboz-block-size", + &cboz_block_size, &cboz_hartid); + } + + if (cbom_block_size) + riscv_cbom_block_size = cbom_block_size; + + if (cboz_block_size) + riscv_cboz_block_size = cboz_block_size; +} diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c new file mode 100644 index 0000000000..217fd4de61 --- /dev/null +++ b/arch/riscv/mm/context.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ + +#include <linux/bitops.h> +#include <linux/cpumask.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/static_key.h> +#include <asm/tlbflush.h> +#include <asm/cacheflush.h> +#include <asm/mmu_context.h> + +#ifdef CONFIG_MMU + +DEFINE_STATIC_KEY_FALSE(use_asid_allocator); + +static unsigned long asid_bits; +static unsigned long num_asids; +unsigned long asid_mask; + +static atomic_long_t current_version; + +static DEFINE_RAW_SPINLOCK(context_lock); +static cpumask_t context_tlb_flush_pending; +static unsigned long *context_asid_map; + +static DEFINE_PER_CPU(atomic_long_t, active_context); +static DEFINE_PER_CPU(unsigned long, reserved_context); + +static bool check_update_reserved_context(unsigned long cntx, + unsigned long newcntx) +{ + int cpu; + bool hit = false; + + /* + * Iterate over the set of reserved CONTEXT looking for a match. + * If we find one, then we can update our mm to use new CONTEXT + * (i.e. the same CONTEXT in the current_version) but we can't + * exit the loop early, since we need to ensure that all copies + * of the old CONTEXT are updated to reflect the mm. Failure to do + * so could result in us missing the reserved CONTEXT in a future + * version. + */ + for_each_possible_cpu(cpu) { + if (per_cpu(reserved_context, cpu) == cntx) { + hit = true; + per_cpu(reserved_context, cpu) = newcntx; + } + } + + return hit; +} + +static void __flush_context(void) +{ + int i; + unsigned long cntx; + + /* Must be called with context_lock held */ + lockdep_assert_held(&context_lock); + + /* Update the list of reserved ASIDs and the ASID bitmap. */ + bitmap_zero(context_asid_map, num_asids); + + /* Mark already active ASIDs as used */ + for_each_possible_cpu(i) { + cntx = atomic_long_xchg_relaxed(&per_cpu(active_context, i), 0); + /* + * If this CPU has already been through a rollover, but + * hasn't run another task in the meantime, we must preserve + * its reserved CONTEXT, as this is the only trace we have of + * the process it is still running. + */ + if (cntx == 0) + cntx = per_cpu(reserved_context, i); + + __set_bit(cntx & asid_mask, context_asid_map); + per_cpu(reserved_context, i) = cntx; + } + + /* Mark ASID #0 as used because it is used at boot-time */ + __set_bit(0, context_asid_map); + + /* Queue a TLB invalidation for each CPU on next context-switch */ + cpumask_setall(&context_tlb_flush_pending); +} + +static unsigned long __new_context(struct mm_struct *mm) +{ + static u32 cur_idx = 1; + unsigned long cntx = atomic_long_read(&mm->context.id); + unsigned long asid, ver = atomic_long_read(¤t_version); + + /* Must be called with context_lock held */ + lockdep_assert_held(&context_lock); + + if (cntx != 0) { + unsigned long newcntx = ver | (cntx & asid_mask); + + /* + * If our current CONTEXT was active during a rollover, we + * can continue to use it and this was just a false alarm. + */ + if (check_update_reserved_context(cntx, newcntx)) + return newcntx; + + /* + * We had a valid CONTEXT in a previous life, so try to + * re-use it if possible. + */ + if (!__test_and_set_bit(cntx & asid_mask, context_asid_map)) + return newcntx; + } + + /* + * Allocate a free ASID. If we can't find one then increment + * current_version and flush all ASIDs. + */ + asid = find_next_zero_bit(context_asid_map, num_asids, cur_idx); + if (asid != num_asids) + goto set_asid; + + /* We're out of ASIDs, so increment current_version */ + ver = atomic_long_add_return_relaxed(num_asids, ¤t_version); + + /* Flush everything */ + __flush_context(); + + /* We have more ASIDs than CPUs, so this will always succeed */ + asid = find_next_zero_bit(context_asid_map, num_asids, 1); + +set_asid: + __set_bit(asid, context_asid_map); + cur_idx = asid; + return asid | ver; +} + +static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) +{ + unsigned long flags; + bool need_flush_tlb = false; + unsigned long cntx, old_active_cntx; + + cntx = atomic_long_read(&mm->context.id); + + /* + * If our active_context is non-zero and the context matches the + * current_version, then we update the active_context entry with a + * relaxed cmpxchg. + * + * Following is how we handle racing with a concurrent rollover: + * + * - We get a zero back from the cmpxchg and end up waiting on the + * lock. Taking the lock synchronises with the rollover and so + * we are forced to see the updated verion. + * + * - We get a valid context back from the cmpxchg then we continue + * using old ASID because __flush_context() would have marked ASID + * of active_context as used and next context switch we will + * allocate new context. + */ + old_active_cntx = atomic_long_read(&per_cpu(active_context, cpu)); + if (old_active_cntx && + ((cntx & ~asid_mask) == atomic_long_read(¤t_version)) && + atomic_long_cmpxchg_relaxed(&per_cpu(active_context, cpu), + old_active_cntx, cntx)) + goto switch_mm_fast; + + raw_spin_lock_irqsave(&context_lock, flags); + + /* Check that our ASID belongs to the current_version. */ + cntx = atomic_long_read(&mm->context.id); + if ((cntx & ~asid_mask) != atomic_long_read(¤t_version)) { + cntx = __new_context(mm); + atomic_long_set(&mm->context.id, cntx); + } + + if (cpumask_test_and_clear_cpu(cpu, &context_tlb_flush_pending)) + need_flush_tlb = true; + + atomic_long_set(&per_cpu(active_context, cpu), cntx); + + raw_spin_unlock_irqrestore(&context_lock, flags); + +switch_mm_fast: + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | + ((cntx & asid_mask) << SATP_ASID_SHIFT) | + satp_mode); + + if (need_flush_tlb) + local_flush_tlb_all(); +} + +static void set_mm_noasid(struct mm_struct *mm) +{ + /* Switch the page table and blindly nuke entire local TLB */ + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); + local_flush_tlb_all(); +} + +static inline void set_mm(struct mm_struct *prev, + struct mm_struct *next, unsigned int cpu) +{ + /* + * The mm_cpumask indicates which harts' TLBs contain the virtual + * address mapping of the mm. Compared to noasid, using asid + * can't guarantee that stale TLB entries are invalidated because + * the asid mechanism wouldn't flush TLB for every switch_mm for + * performance. So when using asid, keep all CPUs footmarks in + * cpumask() until mm reset. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + if (static_branch_unlikely(&use_asid_allocator)) { + set_mm_asid(next, cpu); + } else { + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + set_mm_noasid(next); + } +} + +static int __init asids_init(void) +{ + unsigned long old; + + /* Figure-out number of ASID bits in HW */ + old = csr_read(CSR_SATP); + asid_bits = old | (SATP_ASID_MASK << SATP_ASID_SHIFT); + csr_write(CSR_SATP, asid_bits); + asid_bits = (csr_read(CSR_SATP) >> SATP_ASID_SHIFT) & SATP_ASID_MASK; + asid_bits = fls_long(asid_bits); + csr_write(CSR_SATP, old); + + /* + * In the process of determining number of ASID bits (above) + * we polluted the TLB of current HART so let's do TLB flushed + * to remove unwanted TLB enteries. + */ + local_flush_tlb_all(); + + /* Pre-compute ASID details */ + if (asid_bits) { + num_asids = 1 << asid_bits; + asid_mask = num_asids - 1; + } + + /* + * Use ASID allocator only if number of HW ASIDs are + * at-least twice more than CPUs + */ + if (num_asids > (2 * num_possible_cpus())) { + atomic_long_set(¤t_version, num_asids); + + context_asid_map = bitmap_zalloc(num_asids, GFP_KERNEL); + if (!context_asid_map) + panic("Failed to allocate bitmap for %lu ASIDs\n", + num_asids); + + __set_bit(0, context_asid_map); + + static_branch_enable(&use_asid_allocator); + + pr_info("ASID allocator using %lu bits (%lu entries)\n", + asid_bits, num_asids); + } else { + pr_info("ASID allocator disabled (%lu bits)\n", asid_bits); + } + + return 0; +} +early_initcall(asids_init); +#else +static inline void set_mm(struct mm_struct *prev, + struct mm_struct *next, unsigned int cpu) +{ + /* Nothing to do here when there is no MMU */ +} +#endif + +/* + * When necessary, performs a deferred icache flush for the given MM context, + * on the local CPU. RISC-V has no direct mechanism for instruction cache + * shoot downs, so instead we send an IPI that informs the remote harts they + * need to flush their local instruction caches. To avoid pathologically slow + * behavior in a common case (a bunch of single-hart processes on a many-hart + * machine, ie 'make -j') we avoid the IPIs for harts that are not currently + * executing a MM context and instead schedule a deferred local instruction + * cache flush to be performed before execution resumes on each hart. This + * actually performs that local instruction cache flush, which implicitly only + * refers to the current hart. + * + * The "cpu" argument must be the current local CPU number. + */ +static inline void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu) +{ +#ifdef CONFIG_SMP + cpumask_t *mask = &mm->context.icache_stale_mask; + + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); + /* + * Ensure the remote hart's writes are visible to this hart. + * This pairs with a barrier in flush_icache_mm. + */ + smp_mb(); + local_flush_icache_all(); + } + +#endif +} + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *task) +{ + unsigned int cpu; + + if (unlikely(prev == next)) + return; + + /* + * Mark the current MM context as inactive, and the next as + * active. This is at least used by the icache flushing + * routines in order to determine who should be flushed. + */ + cpu = smp_processor_id(); + + set_mm(prev, next, cpu); + + flush_icache_deferred(next, cpu); +} diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c new file mode 100644 index 0000000000..341bd6706b --- /dev/null +++ b/arch/riscv/mm/dma-noncoherent.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RISC-V specific functions to support DMA for non-coherent devices + * + * Copyright (c) 2021 Western Digital Corporation or its affiliates. + */ + +#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> +#include <linux/mm.h> +#include <asm/cacheflush.h> +#include <asm/dma-noncoherent.h> + +static bool noncoherent_supported __ro_after_init; +int dma_cache_alignment __ro_after_init = ARCH_DMA_MINALIGN; +EXPORT_SYMBOL_GPL(dma_cache_alignment); + +static inline void arch_dma_cache_wback(phys_addr_t paddr, size_t size) +{ + void *vaddr = phys_to_virt(paddr); + +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.wback)) { + noncoherent_cache_ops.wback(paddr, size); + return; + } +#endif + ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size); +} + +static inline void arch_dma_cache_inv(phys_addr_t paddr, size_t size) +{ + void *vaddr = phys_to_virt(paddr); + +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.inv)) { + noncoherent_cache_ops.inv(paddr, size); + return; + } +#endif + + ALT_CMO_OP(inval, vaddr, size, riscv_cbom_block_size); +} + +static inline void arch_dma_cache_wback_inv(phys_addr_t paddr, size_t size) +{ + void *vaddr = phys_to_virt(paddr); + +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.wback_inv)) { + noncoherent_cache_ops.wback_inv(paddr, size); + return; + } +#endif + + ALT_CMO_OP(flush, vaddr, size, riscv_cbom_block_size); +} + +static inline bool arch_sync_dma_clean_before_fromdevice(void) +{ + return true; +} + +static inline bool arch_sync_dma_cpu_needs_post_dma_flush(void) +{ + return true; +} + +void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_TO_DEVICE: + arch_dma_cache_wback(paddr, size); + break; + + case DMA_FROM_DEVICE: + if (!arch_sync_dma_clean_before_fromdevice()) { + arch_dma_cache_inv(paddr, size); + break; + } + fallthrough; + + case DMA_BIDIRECTIONAL: + /* Skip the invalidate here if it's done later */ + if (IS_ENABLED(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) && + arch_sync_dma_cpu_needs_post_dma_flush()) + arch_dma_cache_wback(paddr, size); + else + arch_dma_cache_wback_inv(paddr, size); + break; + + default: + break; + } +} + +void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_TO_DEVICE: + break; + + case DMA_FROM_DEVICE: + case DMA_BIDIRECTIONAL: + /* FROM_DEVICE invalidate needed if speculative CPU prefetch only */ + if (arch_sync_dma_cpu_needs_post_dma_flush()) + arch_dma_cache_inv(paddr, size); + break; + + default: + break; + } +} + +void arch_dma_prep_coherent(struct page *page, size_t size) +{ + void *flush_addr = page_address(page); + +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.wback_inv)) { + noncoherent_cache_ops.wback_inv(page_to_phys(page), size); + return; + } +#endif + + ALT_CMO_OP(flush, flush_addr, size, riscv_cbom_block_size); +} + +void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, + const struct iommu_ops *iommu, bool coherent) +{ + WARN_TAINT(!coherent && riscv_cbom_block_size > ARCH_DMA_MINALIGN, + TAINT_CPU_OUT_OF_SPEC, + "%s %s: ARCH_DMA_MINALIGN smaller than riscv,cbom-block-size (%d < %d)", + dev_driver_string(dev), dev_name(dev), + ARCH_DMA_MINALIGN, riscv_cbom_block_size); + + WARN_TAINT(!coherent && !noncoherent_supported, TAINT_CPU_OUT_OF_SPEC, + "%s %s: device non-coherent but no non-coherent operations supported", + dev_driver_string(dev), dev_name(dev)); + + dev->dma_coherent = coherent; +} + +void riscv_noncoherent_supported(void) +{ + WARN(!riscv_cbom_block_size, + "Non-coherent DMA support enabled without a block size\n"); + noncoherent_supported = true; +} + +void __init riscv_set_dma_cache_alignment(void) +{ + if (!noncoherent_supported) + dma_cache_alignment = 1; +} diff --git a/arch/riscv/mm/extable.c b/arch/riscv/mm/extable.c new file mode 100644 index 0000000000..35484d830f --- /dev/null +++ b/arch/riscv/mm/extable.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2009 Sunplus Core Technology Co., Ltd. + * Lennox Wu <lennox.wu@sunplusct.com> + * Chen Liqin <liqin.chen@sunplusct.com> + * Copyright (C) 2013 Regents of the University of California + */ + + +#include <linux/bitfield.h> +#include <linux/extable.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <asm/asm-extable.h> +#include <asm/ptrace.h> + +static inline unsigned long +get_ex_fixup(const struct exception_table_entry *ex) +{ + return ((unsigned long)&ex->fixup + ex->fixup); +} + +static bool ex_handler_fixup(const struct exception_table_entry *ex, + struct pt_regs *regs) +{ + regs->epc = get_ex_fixup(ex); + return true; +} + +static inline void regs_set_gpr(struct pt_regs *regs, unsigned int offset, + unsigned long val) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return; + + if (offset) + *(unsigned long *)((unsigned long)regs + offset) = val; +} + +static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex, + struct pt_regs *regs) +{ + int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + int reg_zero = FIELD_GET(EX_DATA_REG_ZERO, ex->data); + + regs_set_gpr(regs, reg_err * sizeof(unsigned long), -EFAULT); + regs_set_gpr(regs, reg_zero * sizeof(unsigned long), 0); + + regs->epc = get_ex_fixup(ex); + return true; +} + +bool fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + ex = search_exception_tables(regs->epc); + if (!ex) + return false; + + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UACCESS_ERR_ZERO: + return ex_handler_uaccess_err_zero(ex, regs); + } + + BUG(); +} diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c new file mode 100644 index 0000000000..90d4ba36d1 --- /dev/null +++ b/arch/riscv/mm/fault.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2009 Sunplus Core Technology Co., Ltd. + * Lennox Wu <lennox.wu@sunplusct.com> + * Chen Liqin <liqin.chen@sunplusct.com> + * Copyright (C) 2012 Regents of the University of California + */ + + +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/perf_event.h> +#include <linux/signal.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/kfence.h> +#include <linux/entry-common.h> + +#include <asm/ptrace.h> +#include <asm/tlbflush.h> + +#include "../kernel/head.h" + +static void die_kernel_fault(const char *msg, unsigned long addr, + struct pt_regs *regs) +{ + bust_spinlocks(1); + + pr_alert("Unable to handle kernel %s at virtual address " REG_FMT "\n", msg, + addr); + + bust_spinlocks(0); + die(regs, "Oops"); + make_task_dead(SIGKILL); +} + +static inline void no_context(struct pt_regs *regs, unsigned long addr) +{ + const char *msg; + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ + if (addr < PAGE_SIZE) + msg = "NULL pointer dereference"; + else { + if (kfence_handle_page_fault(addr, regs->cause == EXC_STORE_PAGE_FAULT, regs)) + return; + + msg = "paging request"; + } + + die_kernel_fault(msg, addr, regs); +} + +static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault) +{ + if (fault & VM_FAULT_OOM) { + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + pagefault_out_of_memory(); + return; + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) { + no_context(regs, addr); + return; + } + do_trap(regs, SIGBUS, BUS_ADRERR, addr); + return; + } + BUG(); +} + +static inline void +bad_area_nosemaphore(struct pt_regs *regs, int code, unsigned long addr) +{ + /* + * Something tried to access memory that isn't in our memory map. + * Fix it, but check if it's kernel or user first. + */ + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) { + do_trap(regs, SIGSEGV, code, addr); + return; + } + + no_context(regs, addr); +} + +static inline void +bad_area(struct pt_regs *regs, struct mm_struct *mm, int code, + unsigned long addr) +{ + mmap_read_unlock(mm); + + bad_area_nosemaphore(regs, code, addr); +} + +static inline void vmalloc_fault(struct pt_regs *regs, int code, unsigned long addr) +{ + pgd_t *pgd, *pgd_k; + pud_t *pud_k; + p4d_t *p4d_k; + pmd_t *pmd_k; + pte_t *pte_k; + int index; + unsigned long pfn; + + /* User mode accesses just cause a SIGSEGV */ + if (user_mode(regs)) + return do_trap(regs, SIGSEGV, code, addr); + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "tsk->active_mm->pgd" here. + * We might be inside an interrupt in the middle + * of a task switch. + */ + index = pgd_index(addr); + pfn = csr_read(CSR_SATP) & SATP_PPN; + pgd = (pgd_t *)pfn_to_virt(pfn) + index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) { + no_context(regs, addr); + return; + } + set_pgd(pgd, *pgd_k); + + p4d_k = p4d_offset(pgd_k, addr); + if (!p4d_present(*p4d_k)) { + no_context(regs, addr); + return; + } + + pud_k = pud_offset(p4d_k, addr); + if (!pud_present(*pud_k)) { + no_context(regs, addr); + return; + } + if (pud_leaf(*pud_k)) + goto flush_tlb; + + /* + * Since the vmalloc area is global, it is unnecessary + * to copy individual PTEs + */ + pmd_k = pmd_offset(pud_k, addr); + if (!pmd_present(*pmd_k)) { + no_context(regs, addr); + return; + } + if (pmd_leaf(*pmd_k)) + goto flush_tlb; + + /* + * Make sure the actual PTE exists as well to + * catch kernel vmalloc-area accesses to non-mapped + * addresses. If we don't do this, this will just + * silently loop forever. + */ + pte_k = pte_offset_kernel(pmd_k, addr); + if (!pte_present(*pte_k)) { + no_context(regs, addr); + return; + } + + /* + * The kernel assumes that TLBs don't cache invalid + * entries, but in RISC-V, SFENCE.VMA specifies an + * ordering constraint, not a cache flush; it is + * necessary even after writing invalid entries. + */ +flush_tlb: + local_flush_tlb_page(addr); +} + +static inline bool access_error(unsigned long cause, struct vm_area_struct *vma) +{ + switch (cause) { + case EXC_INST_PAGE_FAULT: + if (!(vma->vm_flags & VM_EXEC)) { + return true; + } + break; + case EXC_LOAD_PAGE_FAULT: + /* Write implies read */ + if (!(vma->vm_flags & (VM_READ | VM_WRITE))) { + return true; + } + break; + case EXC_STORE_PAGE_FAULT: + if (!(vma->vm_flags & VM_WRITE)) { + return true; + } + break; + default: + panic("%s: unhandled cause %lu", __func__, cause); + } + return false; +} + +/* + * This routine handles page faults. It determines the address and the + * problem, and then passes it off to one of the appropriate routines. + */ +void handle_page_fault(struct pt_regs *regs) +{ + struct task_struct *tsk; + struct vm_area_struct *vma; + struct mm_struct *mm; + unsigned long addr, cause; + unsigned int flags = FAULT_FLAG_DEFAULT; + int code = SEGV_MAPERR; + vm_fault_t fault; + + cause = regs->cause; + addr = regs->badaddr; + + tsk = current; + mm = tsk->mm; + + if (kprobe_page_fault(regs, cause)) + return; + + /* + * Fault-in kernel-space virtual memory on-demand. + * The 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + */ + if ((!IS_ENABLED(CONFIG_MMU) || !IS_ENABLED(CONFIG_64BIT)) && + unlikely(addr >= VMALLOC_START && addr < VMALLOC_END)) { + vmalloc_fault(regs, code, addr); + return; + } + + /* Enable interrupts if they were enabled in the parent context. */ + if (!regs_irqs_disabled(regs)) + local_irq_enable(); + + /* + * If we're in an interrupt, have no user context, or are running + * in an atomic region, then we must not take the fault. + */ + if (unlikely(faulthandler_disabled() || !mm)) { + tsk->thread.bad_cause = cause; + no_context(regs, addr); + return; + } + + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + + if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) { + if (fixup_exception(regs)) + return; + + die_kernel_fault("access to user memory without uaccess routines", addr, regs); + } + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + + if (cause == EXC_STORE_PAGE_FAULT) + flags |= FAULT_FLAG_WRITE; + else if (cause == EXC_INST_PAGE_FAULT) + flags |= FAULT_FLAG_INSTRUCTION; + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, addr); + if (!vma) + goto lock_mmap; + + if (unlikely(access_error(cause, vma))) { + vma_end_read(vma); + goto lock_mmap; + } + + fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + no_context(regs, addr); + return; + } +lock_mmap: + +retry: + vma = lock_mm_and_find_vma(mm, addr, regs); + if (unlikely(!vma)) { + tsk->thread.bad_cause = cause; + bad_area_nosemaphore(regs, code, addr); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it. + */ + code = SEGV_ACCERR; + + if (unlikely(access_error(cause, vma))) { + tsk->thread.bad_cause = cause; + bad_area(regs, mm, code, addr); + return; + } + + /* + * If for any reason at all we could not handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(vma, addr, flags, regs); + + /* + * If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_lock because it + * would already be released in __lock_page_or_retry in mm/filemap.c. + */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + no_context(regs, addr); + return; + } + + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + + if (unlikely(fault & VM_FAULT_RETRY)) { + flags |= FAULT_FLAG_TRIED; + + /* + * No need to mmap_read_unlock(mm) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + goto retry; + } + + mmap_read_unlock(mm); + +done: + if (unlikely(fault & VM_FAULT_ERROR)) { + tsk->thread.bad_cause = cause; + mm_fault_error(regs, addr, fault); + return; + } + return; +} diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c new file mode 100644 index 0000000000..b52f021048 --- /dev/null +++ b/arch/riscv/mm/hugetlbpage.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/hugetlb.h> +#include <linux/err.h> + +#ifdef CONFIG_RISCV_ISA_SVNAPOT +pte_t huge_ptep_get(pte_t *ptep) +{ + unsigned long pte_num; + int i; + pte_t orig_pte = ptep_get(ptep); + + if (!pte_present(orig_pte) || !pte_napot(orig_pte)) + return orig_pte; + + pte_num = napot_pte_num(napot_cont_order(orig_pte)); + + for (i = 0; i < pte_num; i++, ptep++) { + pte_t pte = ptep_get(ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + unsigned long sz) +{ + unsigned long order; + pte_t *pte = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + goto out; + } + + if (sz == PMD_SIZE) { + if (want_pmd_share(vma, addr) && pud_none(*pud)) + pte = huge_pmd_share(mm, vma, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + goto out; + } + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; + + for_each_napot_order(order) { + if (napot_cont_size(order) == sz) { + pte = pte_alloc_huge(mm, pmd, addr & napot_cont_mask(order)); + break; + } + } + +out: + if (pte) { + pte_t pteval = ptep_get_lockless(pte); + + WARN_ON_ONCE(pte_present(pteval) && !pte_huge(pteval)); + } + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, + unsigned long sz) +{ + unsigned long order; + pte_t *pte = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (sz == PUD_SIZE) + /* must be pud huge, non-present or none */ + return (pte_t *)pud; + + if (!pud_present(*pud)) + return NULL; + + pmd = pmd_offset(pud, addr); + if (sz == PMD_SIZE) + /* must be pmd huge, non-present or none */ + return (pte_t *)pmd; + + if (!pmd_present(*pmd)) + return NULL; + + for_each_napot_order(order) { + if (napot_cont_size(order) == sz) { + pte = pte_offset_huge(pmd, addr & napot_cont_mask(order)); + break; + } + } + return pte; +} + +static pte_t get_clear_contig(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pte_num) +{ + pte_t orig_pte = ptep_get(ptep); + unsigned long i; + + for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) { + pte_t pte = ptep_get_and_clear(mm, addr, ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} + +static pte_t get_clear_contig_flush(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long pte_num) +{ + pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num); + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + bool valid = !pte_none(orig_pte); + + if (valid) + flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num)); + + return orig_pte; +} + +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) +{ + unsigned long order; + + for_each_napot_order(order) { + if (shift == napot_cont_shift(order)) { + entry = pte_mknapot(entry, order); + break; + } + } + if (order == NAPOT_ORDER_MAX) + entry = pte_mkhuge(entry); + + return entry; +} + +void set_huge_pte_at(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + pte_t pte, + unsigned long sz) +{ + unsigned long hugepage_shift; + int i, pte_num; + + if (sz >= PGDIR_SIZE) + hugepage_shift = PGDIR_SHIFT; + else if (sz >= P4D_SIZE) + hugepage_shift = P4D_SHIFT; + else if (sz >= PUD_SIZE) + hugepage_shift = PUD_SHIFT; + else if (sz >= PMD_SIZE) + hugepage_shift = PMD_SHIFT; + else + hugepage_shift = PAGE_SHIFT; + + pte_num = sz >> hugepage_shift; + for (i = 0; i < pte_num; i++, ptep++, addr += (1 << hugepage_shift)) + set_pte_at(mm, addr, ptep, pte); +} + +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, + pte_t *ptep, + pte_t pte, + int dirty) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long order; + pte_t orig_pte; + int i, pte_num; + + if (!pte_napot(pte)) + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + + order = napot_cont_order(pte); + pte_num = napot_pte_num(order); + ptep = huge_pte_offset(mm, addr, napot_cont_size(order)); + orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num); + + if (pte_dirty(orig_pte)) + pte = pte_mkdirty(pte); + + if (pte_young(orig_pte)) + pte = pte_mkyoung(pte); + + for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) + set_pte_at(mm, addr, ptep, pte); + + return true; +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep) +{ + pte_t orig_pte = ptep_get(ptep); + int pte_num; + + if (!pte_napot(orig_pte)) + return ptep_get_and_clear(mm, addr, ptep); + + pte_num = napot_pte_num(napot_cont_order(orig_pte)); + + return get_clear_contig(mm, addr, ptep, pte_num); +} + +void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep) +{ + pte_t pte = ptep_get(ptep); + unsigned long order; + pte_t orig_pte; + int i, pte_num; + + if (!pte_napot(pte)) { + ptep_set_wrprotect(mm, addr, ptep); + return; + } + + order = napot_cont_order(pte); + pte_num = napot_pte_num(order); + ptep = huge_pte_offset(mm, addr, napot_cont_size(order)); + orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num); + + orig_pte = pte_wrprotect(orig_pte); + + for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) + set_pte_at(mm, addr, ptep, orig_pte); +} + +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, + pte_t *ptep) +{ + pte_t pte = ptep_get(ptep); + int pte_num; + + if (!pte_napot(pte)) + return ptep_clear_flush(vma, addr, ptep); + + pte_num = napot_pte_num(napot_cont_order(pte)); + + return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num); +} + +void huge_pte_clear(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, + unsigned long sz) +{ + pte_t pte = READ_ONCE(*ptep); + int i, pte_num; + + if (!pte_napot(pte)) { + pte_clear(mm, addr, ptep); + return; + } + + pte_num = napot_pte_num(napot_cont_order(pte)); + for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) + pte_clear(mm, addr, ptep); +} + +static __init bool is_napot_size(unsigned long size) +{ + unsigned long order; + + if (!has_svnapot()) + return false; + + for_each_napot_order(order) { + if (size == napot_cont_size(order)) + return true; + } + return false; +} + +static __init int napot_hugetlbpages_init(void) +{ + if (has_svnapot()) { + unsigned long order; + + for_each_napot_order(order) + hugetlb_add_hstate(order); + } + return 0; +} +arch_initcall(napot_hugetlbpages_init); + +#else + +static __init bool is_napot_size(unsigned long size) +{ + return false; +} + +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/ + +int pud_huge(pud_t pud) +{ + return pud_leaf(pud); +} + +int pmd_huge(pmd_t pmd) +{ + return pmd_leaf(pmd); +} + +bool __init arch_hugetlb_valid_size(unsigned long size) +{ + if (size == HPAGE_SIZE) + return true; + else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE) + return true; + else if (is_napot_size(size)) + return true; + else + return false; +} + +#ifdef CONFIG_CONTIG_ALLOC +static __init int gigantic_pages_init(void) +{ + /* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */ + if (IS_ENABLED(CONFIG_64BIT)) + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + return 0; +} +arch_initcall(gigantic_pages_init); +#endif diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c new file mode 100644 index 0000000000..0798bd861d --- /dev/null +++ b/arch/riscv/mm/init.c @@ -0,0 +1,1566 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * Copyright (C) 2020 FORTH-ICS/CARV + * Nick Kossifidis <mick@ics.forth.gr> + */ + +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/memblock.h> +#include <linux/initrd.h> +#include <linux/swap.h> +#include <linux/swiotlb.h> +#include <linux/sizes.h> +#include <linux/of_fdt.h> +#include <linux/of_reserved_mem.h> +#include <linux/libfdt.h> +#include <linux/set_memory.h> +#include <linux/dma-map-ops.h> +#include <linux/crash_dump.h> +#include <linux/hugetlb.h> +#ifdef CONFIG_RELOCATABLE +#include <linux/elf.h> +#endif +#include <linux/kfence.h> + +#include <asm/fixmap.h> +#include <asm/io.h> +#include <asm/numa.h> +#include <asm/pgtable.h> +#include <asm/ptdump.h> +#include <asm/sections.h> +#include <asm/soc.h> +#include <asm/tlbflush.h> + +#include "../kernel/head.h" + +struct kernel_mapping kernel_map __ro_after_init; +EXPORT_SYMBOL(kernel_map); +#ifdef CONFIG_XIP_KERNEL +#define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) +#endif + +#ifdef CONFIG_64BIT +u64 satp_mode __ro_after_init = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_57 : SATP_MODE_39; +#else +u64 satp_mode __ro_after_init = SATP_MODE_32; +#endif +EXPORT_SYMBOL(satp_mode); + +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); +bool pgtable_l5_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); +EXPORT_SYMBOL(pgtable_l4_enabled); +EXPORT_SYMBOL(pgtable_l5_enabled); + +phys_addr_t phys_ram_base __ro_after_init; +EXPORT_SYMBOL(phys_ram_base); + +unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] + __page_aligned_bss; +EXPORT_SYMBOL(empty_zero_page); + +extern char _start[]; +void *_dtb_early_va __initdata; +uintptr_t _dtb_early_pa __initdata; + +static phys_addr_t dma32_phys_limit __initdata; + +static void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0, }; + +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; + + free_area_init(max_zone_pfns); +} + +#if defined(CONFIG_MMU) && defined(CONFIG_DEBUG_VM) + +#define LOG2_SZ_1K ilog2(SZ_1K) +#define LOG2_SZ_1M ilog2(SZ_1M) +#define LOG2_SZ_1G ilog2(SZ_1G) +#define LOG2_SZ_1T ilog2(SZ_1T) + +static inline void print_mlk(char *name, unsigned long b, unsigned long t) +{ + pr_notice("%12s : 0x%08lx - 0x%08lx (%4ld kB)\n", name, b, t, + (((t) - (b)) >> LOG2_SZ_1K)); +} + +static inline void print_mlm(char *name, unsigned long b, unsigned long t) +{ + pr_notice("%12s : 0x%08lx - 0x%08lx (%4ld MB)\n", name, b, t, + (((t) - (b)) >> LOG2_SZ_1M)); +} + +static inline void print_mlg(char *name, unsigned long b, unsigned long t) +{ + pr_notice("%12s : 0x%08lx - 0x%08lx (%4ld GB)\n", name, b, t, + (((t) - (b)) >> LOG2_SZ_1G)); +} + +#ifdef CONFIG_64BIT +static inline void print_mlt(char *name, unsigned long b, unsigned long t) +{ + pr_notice("%12s : 0x%08lx - 0x%08lx (%4ld TB)\n", name, b, t, + (((t) - (b)) >> LOG2_SZ_1T)); +} +#else +#define print_mlt(n, b, t) do {} while (0) +#endif + +static inline void print_ml(char *name, unsigned long b, unsigned long t) +{ + unsigned long diff = t - b; + + if (IS_ENABLED(CONFIG_64BIT) && (diff >> LOG2_SZ_1T) >= 10) + print_mlt(name, b, t); + else if ((diff >> LOG2_SZ_1G) >= 10) + print_mlg(name, b, t); + else if ((diff >> LOG2_SZ_1M) >= 10) + print_mlm(name, b, t); + else + print_mlk(name, b, t); +} + +static void __init print_vm_layout(void) +{ + pr_notice("Virtual kernel memory layout:\n"); + print_ml("fixmap", (unsigned long)FIXADDR_START, + (unsigned long)FIXADDR_TOP); + print_ml("pci io", (unsigned long)PCI_IO_START, + (unsigned long)PCI_IO_END); + print_ml("vmemmap", (unsigned long)VMEMMAP_START, + (unsigned long)VMEMMAP_END); + print_ml("vmalloc", (unsigned long)VMALLOC_START, + (unsigned long)VMALLOC_END); +#ifdef CONFIG_64BIT + print_ml("modules", (unsigned long)MODULES_VADDR, + (unsigned long)MODULES_END); +#endif + print_ml("lowmem", (unsigned long)PAGE_OFFSET, + (unsigned long)high_memory); + if (IS_ENABLED(CONFIG_64BIT)) { +#ifdef CONFIG_KASAN + print_ml("kasan", KASAN_SHADOW_START, KASAN_SHADOW_END); +#endif + + print_ml("kernel", (unsigned long)kernel_map.virt_addr, + (unsigned long)ADDRESS_SPACE_END); + } +} +#else +static void print_vm_layout(void) { } +#endif /* CONFIG_DEBUG_VM */ + +void __init mem_init(void) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(!mem_map); +#endif /* CONFIG_FLATMEM */ + + swiotlb_init(max_pfn > PFN_DOWN(dma32_phys_limit), SWIOTLB_VERBOSE); + memblock_free_all(); + + print_vm_layout(); +} + +/* Limit the memory size via mem. */ +static phys_addr_t memory_limit; + +static int __init early_mem(char *p) +{ + u64 size; + + if (!p) + return 1; + + size = memparse(p, &p) & PAGE_MASK; + memory_limit = min_t(u64, size, memory_limit); + + pr_notice("Memory limited to %lldMB\n", (u64)memory_limit >> 20); + + return 0; +} +early_param("mem", early_mem); + +static void __init setup_bootmem(void) +{ + phys_addr_t vmlinux_end = __pa_symbol(&_end); + phys_addr_t max_mapped_addr; + phys_addr_t phys_ram_end, vmlinux_start; + + if (IS_ENABLED(CONFIG_XIP_KERNEL)) + vmlinux_start = __pa_symbol(&_sdata); + else + vmlinux_start = __pa_symbol(&_start); + + memblock_enforce_memory_limit(memory_limit); + + /* + * Make sure we align the reservation on PMD_SIZE since we will + * map the kernel in the linear mapping as read-only: we do not want + * any allocation to happen between _end and the next pmd aligned page. + */ + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK; + /* + * Reserve from the start of the kernel to the end of the kernel + */ + memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); + + phys_ram_end = memblock_end_of_DRAM(); + + /* + * Make sure we align the start of the memory on a PMD boundary so that + * at worst, we map the linear mapping with PMD mappings. + */ + if (!IS_ENABLED(CONFIG_XIP_KERNEL)) + phys_ram_base = memblock_start_of_DRAM() & PMD_MASK; + + /* + * In 64-bit, any use of __va/__pa before this point is wrong as we + * did not know the start of DRAM before. + */ + if (IS_ENABLED(CONFIG_64BIT)) + kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base; + + /* + * memblock allocator is not aware of the fact that last 4K bytes of + * the addressable memory can not be mapped because of IS_ERR_VALUE + * macro. Make sure that last 4k bytes are not usable by memblock + * if end of dram is equal to maximum addressable memory. For 64-bit + * kernel, this problem can't happen here as the end of the virtual + * address space is occupied by the kernel mapping then this check must + * be done as soon as the kernel mapping base address is determined. + */ + if (!IS_ENABLED(CONFIG_64BIT)) { + max_mapped_addr = __pa(~(ulong)0); + if (max_mapped_addr == (phys_ram_end - 1)) + memblock_set_current_limit(max_mapped_addr - 4096); + } + + min_low_pfn = PFN_UP(phys_ram_base); + max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); + high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); + + dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); + set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); + + reserve_initrd_mem(); + + /* + * No allocation should be done before reserving the memory as defined + * in the device tree, otherwise the allocation could end up in a + * reserved region. + */ + early_init_fdt_scan_reserved_mem(); + + /* + * If DTB is built in, no need to reserve its memblock. + * Otherwise, do reserve it but avoid using + * early_init_fdt_reserve_self() since __pa() does + * not work for DTB pointers that are fixmap addresses + */ + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + + dma_contiguous_reserve(dma32_phys_limit); + if (IS_ENABLED(CONFIG_64BIT)) + hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); +} + +#ifdef CONFIG_MMU +struct pt_alloc_ops pt_ops __initdata; + +pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; +pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; +static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; + +pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); + +#ifdef CONFIG_XIP_KERNEL +#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) +#define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) +#define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) +#define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) +#endif /* CONFIG_XIP_KERNEL */ + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READ, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_EXEC, + [VM_EXEC | VM_READ] = PAGE_READ_EXEC, + [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READ, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READ_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC +}; +DECLARE_VM_GET_PAGE_PROT + +void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *ptep; + + BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); + + ptep = &fixmap_pte[pte_index(addr)]; + + if (pgprot_val(prot)) + set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot)); + else + pte_clear(&init_mm, addr, ptep); + local_flush_tlb_page(addr); +} + +static inline pte_t *__init get_pte_virt_early(phys_addr_t pa) +{ + return (pte_t *)((uintptr_t)pa); +} + +static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PTE); + return (pte_t *)set_fixmap_offset(FIX_PTE, pa); +} + +static inline pte_t *__init get_pte_virt_late(phys_addr_t pa) +{ + return (pte_t *) __va(pa); +} + +static inline phys_addr_t __init alloc_pte_early(uintptr_t va) +{ + /* + * We only create PMD or PGD early mappings so we + * should never reach here with MMU disabled. + */ + BUG(); +} + +static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t __init alloc_pte_late(uintptr_t va) +{ + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); + + BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc)); + return __pa((pte_t *)ptdesc_address(ptdesc)); +} + +static void __init create_pte_mapping(pte_t *ptep, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + uintptr_t pte_idx = pte_index(va); + + BUG_ON(sz != PAGE_SIZE); + + if (pte_none(ptep[pte_idx])) + ptep[pte_idx] = pfn_pte(PFN_DOWN(pa), prot); +} + +#ifndef __PAGETABLE_PMD_FOLDED + +static pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss; +static pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; +static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); + +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd)) +#define fixmap_pmd ((pmd_t *)XIP_FIXUP(fixmap_pmd)) +#define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) +#endif /* CONFIG_XIP_KERNEL */ + +static p4d_t trampoline_p4d[PTRS_PER_P4D] __page_aligned_bss; +static p4d_t fixmap_p4d[PTRS_PER_P4D] __page_aligned_bss; +static p4d_t early_p4d[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); + +#ifdef CONFIG_XIP_KERNEL +#define trampoline_p4d ((p4d_t *)XIP_FIXUP(trampoline_p4d)) +#define fixmap_p4d ((p4d_t *)XIP_FIXUP(fixmap_p4d)) +#define early_p4d ((p4d_t *)XIP_FIXUP(early_p4d)) +#endif /* CONFIG_XIP_KERNEL */ + +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); + +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) +#define fixmap_pud ((pud_t *)XIP_FIXUP(fixmap_pud)) +#define early_pud ((pud_t *)XIP_FIXUP(early_pud)) +#endif /* CONFIG_XIP_KERNEL */ + +static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) +{ + /* Before MMU is enabled */ + return (pmd_t *)((uintptr_t)pa); +} + +static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PMD); + return (pmd_t *)set_fixmap_offset(FIX_PMD, pa); +} + +static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) +{ + return (pmd_t *) __va(pa); +} + +static phys_addr_t __init alloc_pmd_early(uintptr_t va) +{ + BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); + + return (uintptr_t)early_pmd; +} + +static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t __init alloc_pmd_late(uintptr_t va) +{ + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); + + BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc)); + return __pa((pmd_t *)ptdesc_address(ptdesc)); +} + +static void __init create_pmd_mapping(pmd_t *pmdp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + pte_t *ptep; + phys_addr_t pte_phys; + uintptr_t pmd_idx = pmd_index(va); + + if (sz == PMD_SIZE) { + if (pmd_none(pmdp[pmd_idx])) + pmdp[pmd_idx] = pfn_pmd(PFN_DOWN(pa), prot); + return; + } + + if (pmd_none(pmdp[pmd_idx])) { + pte_phys = pt_ops.alloc_pte(va); + pmdp[pmd_idx] = pfn_pmd(PFN_DOWN(pte_phys), PAGE_TABLE); + ptep = pt_ops.get_pte_virt(pte_phys); + memset(ptep, 0, PAGE_SIZE); + } else { + pte_phys = PFN_PHYS(_pmd_pfn(pmdp[pmd_idx])); + ptep = pt_ops.get_pte_virt(pte_phys); + } + + create_pte_mapping(ptep, va, pa, sz, prot); +} + +static pud_t *__init get_pud_virt_early(phys_addr_t pa) +{ + return (pud_t *)((uintptr_t)pa); +} + +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PUD); + return (pud_t *)set_fixmap_offset(FIX_PUD, pa); +} + +static pud_t *__init get_pud_virt_late(phys_addr_t pa) +{ + return (pud_t *)__va(pa); +} + +static phys_addr_t __init alloc_pud_early(uintptr_t va) +{ + /* Only one PUD is available for early mapping */ + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + + return (uintptr_t)early_pud; +} + +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t alloc_pud_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + BUG_ON(!vaddr); + return __pa(vaddr); +} + +static p4d_t *__init get_p4d_virt_early(phys_addr_t pa) +{ + return (p4d_t *)((uintptr_t)pa); +} + +static p4d_t *__init get_p4d_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_P4D); + return (p4d_t *)set_fixmap_offset(FIX_P4D, pa); +} + +static p4d_t *__init get_p4d_virt_late(phys_addr_t pa) +{ + return (p4d_t *)__va(pa); +} + +static phys_addr_t __init alloc_p4d_early(uintptr_t va) +{ + /* Only one P4D is available for early mapping */ + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + + return (uintptr_t)early_p4d; +} + +static phys_addr_t __init alloc_p4d_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t alloc_p4d_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + BUG_ON(!vaddr); + return __pa(vaddr); +} + +static void __init create_pud_mapping(pud_t *pudp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + pmd_t *nextp; + phys_addr_t next_phys; + uintptr_t pud_index = pud_index(va); + + if (sz == PUD_SIZE) { + if (pud_val(pudp[pud_index]) == 0) + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot); + return; + } + + if (pud_val(pudp[pud_index]) == 0) { + next_phys = pt_ops.alloc_pmd(va); + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); + nextp = pt_ops.get_pmd_virt(next_phys); + memset(nextp, 0, PAGE_SIZE); + } else { + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); + nextp = pt_ops.get_pmd_virt(next_phys); + } + + create_pmd_mapping(nextp, va, pa, sz, prot); +} + +static void __init create_p4d_mapping(p4d_t *p4dp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + pud_t *nextp; + phys_addr_t next_phys; + uintptr_t p4d_index = p4d_index(va); + + if (sz == P4D_SIZE) { + if (p4d_val(p4dp[p4d_index]) == 0) + p4dp[p4d_index] = pfn_p4d(PFN_DOWN(pa), prot); + return; + } + + if (p4d_val(p4dp[p4d_index]) == 0) { + next_phys = pt_ops.alloc_pud(va); + p4dp[p4d_index] = pfn_p4d(PFN_DOWN(next_phys), PAGE_TABLE); + nextp = pt_ops.get_pud_virt(next_phys); + memset(nextp, 0, PAGE_SIZE); + } else { + next_phys = PFN_PHYS(_p4d_pfn(p4dp[p4d_index])); + nextp = pt_ops.get_pud_virt(next_phys); + } + + create_pud_mapping(nextp, va, pa, sz, prot); +} + +#define pgd_next_t p4d_t +#define alloc_pgd_next(__va) (pgtable_l5_enabled ? \ + pt_ops.alloc_p4d(__va) : (pgtable_l4_enabled ? \ + pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va))) +#define get_pgd_next_virt(__pa) (pgtable_l5_enabled ? \ + pt_ops.get_p4d_virt(__pa) : (pgd_next_t *)(pgtable_l4_enabled ? \ + pt_ops.get_pud_virt(__pa) : (pud_t *)pt_ops.get_pmd_virt(__pa))) +#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ + (pgtable_l5_enabled ? \ + create_p4d_mapping(__nextp, __va, __pa, __sz, __prot) : \ + (pgtable_l4_enabled ? \ + create_pud_mapping((pud_t *)__nextp, __va, __pa, __sz, __prot) : \ + create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot))) +#define fixmap_pgd_next (pgtable_l5_enabled ? \ + (uintptr_t)fixmap_p4d : (pgtable_l4_enabled ? \ + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)) +#define trampoline_pgd_next (pgtable_l5_enabled ? \ + (uintptr_t)trampoline_p4d : (pgtable_l4_enabled ? \ + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)) +#else +#define pgd_next_t pte_t +#define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) +#define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) +#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ + create_pte_mapping(__nextp, __va, __pa, __sz, __prot) +#define fixmap_pgd_next ((uintptr_t)fixmap_pte) +#define create_p4d_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) +#define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0) +#endif /* __PAGETABLE_PMD_FOLDED */ + +void __init create_pgd_mapping(pgd_t *pgdp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + pgd_next_t *nextp; + phys_addr_t next_phys; + uintptr_t pgd_idx = pgd_index(va); + + if (sz == PGDIR_SIZE) { + if (pgd_val(pgdp[pgd_idx]) == 0) + pgdp[pgd_idx] = pfn_pgd(PFN_DOWN(pa), prot); + return; + } + + if (pgd_val(pgdp[pgd_idx]) == 0) { + next_phys = alloc_pgd_next(va); + pgdp[pgd_idx] = pfn_pgd(PFN_DOWN(next_phys), PAGE_TABLE); + nextp = get_pgd_next_virt(next_phys); + memset(nextp, 0, PAGE_SIZE); + } else { + next_phys = PFN_PHYS(_pgd_pfn(pgdp[pgd_idx])); + nextp = get_pgd_next_virt(next_phys); + } + + create_pgd_next_mapping(nextp, va, pa, sz, prot); +} + +static uintptr_t __init best_map_size(phys_addr_t pa, uintptr_t va, + phys_addr_t size) +{ + if (!(pa & (PGDIR_SIZE - 1)) && !(va & (PGDIR_SIZE - 1)) && size >= PGDIR_SIZE) + return PGDIR_SIZE; + + if (!(pa & (P4D_SIZE - 1)) && !(va & (P4D_SIZE - 1)) && size >= P4D_SIZE) + return P4D_SIZE; + + if (!(pa & (PUD_SIZE - 1)) && !(va & (PUD_SIZE - 1)) && size >= PUD_SIZE) + return PUD_SIZE; + + if (!(pa & (PMD_SIZE - 1)) && !(va & (PMD_SIZE - 1)) && size >= PMD_SIZE) + return PMD_SIZE; + + return PAGE_SIZE; +} + +#ifdef CONFIG_XIP_KERNEL +#define phys_ram_base (*(phys_addr_t *)XIP_FIXUP(&phys_ram_base)) +extern char _xiprom[], _exiprom[], __data_loc; + +/* called from head.S with MMU off */ +asmlinkage void __init __copy_data(void) +{ + void *from = (void *)(&__data_loc); + void *to = (void *)CONFIG_PHYS_RAM_BASE; + size_t sz = (size_t)((uintptr_t)(&_end) - (uintptr_t)(&_sdata)); + + memcpy(to, from, sz); +} +#endif + +#ifdef CONFIG_STRICT_KERNEL_RWX +static __init pgprot_t pgprot_from_va(uintptr_t va) +{ + if (is_va_kernel_text(va)) + return PAGE_KERNEL_READ_EXEC; + + /* + * In 64-bit kernel, the kernel mapping is outside the linear mapping so + * we must protect its linear mapping alias from being executed and + * written. + * And rodata section is marked readonly in mark_rodata_ro. + */ + if (IS_ENABLED(CONFIG_64BIT) && is_va_kernel_lm_alias_text(va)) + return PAGE_KERNEL_READ; + + return PAGE_KERNEL; +} + +void mark_rodata_ro(void) +{ + set_kernel_memory(__start_rodata, _data, set_memory_ro); + if (IS_ENABLED(CONFIG_64BIT)) + set_kernel_memory(lm_alias(__start_rodata), lm_alias(_data), + set_memory_ro); + + debug_checkwx(); +} +#else +static __init pgprot_t pgprot_from_va(uintptr_t va) +{ + if (IS_ENABLED(CONFIG_64BIT) && !is_kernel_mapping(va)) + return PAGE_KERNEL; + + return PAGE_KERNEL_EXEC; +} +#endif /* CONFIG_STRICT_KERNEL_RWX */ + +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) +u64 __pi_set_satp_mode_from_cmdline(uintptr_t dtb_pa); + +static void __init disable_pgtable_l5(void) +{ + pgtable_l5_enabled = false; + kernel_map.page_offset = PAGE_OFFSET_L4; + satp_mode = SATP_MODE_48; +} + +static void __init disable_pgtable_l4(void) +{ + pgtable_l4_enabled = false; + kernel_map.page_offset = PAGE_OFFSET_L3; + satp_mode = SATP_MODE_39; +} + +static int __init print_no4lvl(char *p) +{ + pr_info("Disabled 4-level and 5-level paging"); + return 0; +} +early_param("no4lvl", print_no4lvl); + +static int __init print_no5lvl(char *p) +{ + pr_info("Disabled 5-level paging"); + return 0; +} +early_param("no5lvl", print_no5lvl); + +/* + * There is a simple way to determine if 4-level is supported by the + * underlying hardware: establish 1:1 mapping in 4-level page table mode + * then read SATP to see if the configuration was taken into account + * meaning sv48 is supported. + */ +static __init void set_satp_mode(uintptr_t dtb_pa) +{ + u64 identity_satp, hw_satp; + uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; + u64 satp_mode_cmdline = __pi_set_satp_mode_from_cmdline(dtb_pa); + + if (satp_mode_cmdline == SATP_MODE_57) { + disable_pgtable_l5(); + } else if (satp_mode_cmdline == SATP_MODE_48) { + disable_pgtable_l5(); + disable_pgtable_l4(); + return; + } + + create_p4d_mapping(early_p4d, + set_satp_mode_pmd, (uintptr_t)early_pud, + P4D_SIZE, PAGE_TABLE); + create_pud_mapping(early_pud, + set_satp_mode_pmd, (uintptr_t)early_pmd, + PUD_SIZE, PAGE_TABLE); + /* Handle the case where set_satp_mode straddles 2 PMDs */ + create_pmd_mapping(early_pmd, + set_satp_mode_pmd, set_satp_mode_pmd, + PMD_SIZE, PAGE_KERNEL_EXEC); + create_pmd_mapping(early_pmd, + set_satp_mode_pmd + PMD_SIZE, + set_satp_mode_pmd + PMD_SIZE, + PMD_SIZE, PAGE_KERNEL_EXEC); +retry: + create_pgd_mapping(early_pg_dir, + set_satp_mode_pmd, + pgtable_l5_enabled ? + (uintptr_t)early_p4d : (uintptr_t)early_pud, + PGDIR_SIZE, PAGE_TABLE); + + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; + + local_flush_tlb_all(); + csr_write(CSR_SATP, identity_satp); + hw_satp = csr_swap(CSR_SATP, 0ULL); + local_flush_tlb_all(); + + if (hw_satp != identity_satp) { + if (pgtable_l5_enabled) { + disable_pgtable_l5(); + memset(early_pg_dir, 0, PAGE_SIZE); + goto retry; + } + disable_pgtable_l4(); + } + + memset(early_pg_dir, 0, PAGE_SIZE); + memset(early_p4d, 0, PAGE_SIZE); + memset(early_pud, 0, PAGE_SIZE); + memset(early_pmd, 0, PAGE_SIZE); +} +#endif + +/* + * setup_vm() is called from head.S with MMU-off. + * + * Following requirements should be honoured for setup_vm() to work + * correctly: + * 1) It should use PC-relative addressing for accessing kernel symbols. + * To achieve this we always use GCC cmodel=medany. + * 2) The compiler instrumentation for FTRACE will not work for setup_vm() + * so disable compiler instrumentation when FTRACE is enabled. + * + * Currently, the above requirements are honoured by using custom CFLAGS + * for init.o in mm/Makefile. + */ + +#ifndef __riscv_cmodel_medany +#error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." +#endif + +#ifdef CONFIG_RELOCATABLE +extern unsigned long __rela_dyn_start, __rela_dyn_end; + +static void __init relocate_kernel(void) +{ + Elf64_Rela *rela = (Elf64_Rela *)&__rela_dyn_start; + /* + * This holds the offset between the linked virtual address and the + * relocated virtual address. + */ + uintptr_t reloc_offset = kernel_map.virt_addr - KERNEL_LINK_ADDR; + /* + * This holds the offset between kernel linked virtual address and + * physical address. + */ + uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - kernel_map.phys_addr; + + for ( ; rela < (Elf64_Rela *)&__rela_dyn_end; rela++) { + Elf64_Addr addr = (rela->r_offset - va_kernel_link_pa_offset); + Elf64_Addr relocated_addr = rela->r_addend; + + if (rela->r_info != R_RISCV_RELATIVE) + continue; + + /* + * Make sure to not relocate vdso symbols like rt_sigreturn + * which are linked from the address 0 in vmlinux since + * vdso symbol addresses are actually used as an offset from + * mm->context.vdso in VDSO_OFFSET macro. + */ + if (relocated_addr >= KERNEL_LINK_ADDR) + relocated_addr += reloc_offset; + + *(Elf64_Addr *)addr = relocated_addr; + } +} +#endif /* CONFIG_RELOCATABLE */ + +#ifdef CONFIG_XIP_KERNEL +static void __init create_kernel_page_table(pgd_t *pgdir, + __always_unused bool early) +{ + uintptr_t va, end_va; + + /* Map the flash resident part */ + end_va = kernel_map.virt_addr + kernel_map.xiprom_sz; + for (va = kernel_map.virt_addr; va < end_va; va += PMD_SIZE) + create_pgd_mapping(pgdir, va, + kernel_map.xiprom + (va - kernel_map.virt_addr), + PMD_SIZE, PAGE_KERNEL_EXEC); + + /* Map the data in RAM */ + end_va = kernel_map.virt_addr + XIP_OFFSET + kernel_map.size; + for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE) + create_pgd_mapping(pgdir, va, + kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)), + PMD_SIZE, PAGE_KERNEL); +} +#else +static void __init create_kernel_page_table(pgd_t *pgdir, bool early) +{ + uintptr_t va, end_va; + + end_va = kernel_map.virt_addr + kernel_map.size; + for (va = kernel_map.virt_addr; va < end_va; va += PMD_SIZE) + create_pgd_mapping(pgdir, va, + kernel_map.phys_addr + (va - kernel_map.virt_addr), + PMD_SIZE, + early ? + PAGE_KERNEL_EXEC : pgprot_from_va(va)); +} +#endif + +/* + * Setup a 4MB mapping that encompasses the device tree: for 64-bit kernel, + * this means 2 PMD entries whereas for 32-bit kernel, this is only 1 PGDIR + * entry. + */ +static void __init create_fdt_early_page_table(uintptr_t fix_fdt_va, + uintptr_t dtb_pa) +{ +#ifndef CONFIG_BUILTIN_DTB + uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); + + /* Make sure the fdt fixmap address is always aligned on PMD size */ + BUILD_BUG_ON(FIX_FDT % (PMD_SIZE / PAGE_SIZE)); + + /* In 32-bit only, the fdt lies in its own PGD */ + if (!IS_ENABLED(CONFIG_64BIT)) { + create_pgd_mapping(early_pg_dir, fix_fdt_va, + pa, MAX_FDT_SIZE, PAGE_KERNEL); + } else { + create_pmd_mapping(fixmap_pmd, fix_fdt_va, + pa, PMD_SIZE, PAGE_KERNEL); + create_pmd_mapping(fixmap_pmd, fix_fdt_va + PMD_SIZE, + pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); + } + + dtb_early_va = (void *)fix_fdt_va + (dtb_pa & (PMD_SIZE - 1)); +#else + /* + * For 64-bit kernel, __va can't be used since it would return a linear + * mapping address whereas dtb_early_va will be used before + * setup_vm_final installs the linear mapping. For 32-bit kernel, as the + * kernel is mapped in the linear mapping, that makes no difference. + */ + dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); +#endif + + dtb_early_pa = dtb_pa; +} + +/* + * MMU is not enabled, the page tables are allocated directly using + * early_pmd/pud/p4d and the address returned is the physical one. + */ +static void __init pt_ops_set_early(void) +{ + pt_ops.alloc_pte = alloc_pte_early; + pt_ops.get_pte_virt = get_pte_virt_early; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_early; + pt_ops.get_pmd_virt = get_pmd_virt_early; + pt_ops.alloc_pud = alloc_pud_early; + pt_ops.get_pud_virt = get_pud_virt_early; + pt_ops.alloc_p4d = alloc_p4d_early; + pt_ops.get_p4d_virt = get_p4d_virt_early; +#endif +} + +/* + * MMU is enabled but page table setup is not complete yet. + * fixmap page table alloc functions must be used as a means to temporarily + * map the allocated physical pages since the linear mapping does not exist yet. + * + * Note that this is called with MMU disabled, hence kernel_mapping_pa_to_va, + * but it will be used as described above. + */ +static void __init pt_ops_set_fixmap(void) +{ + pt_ops.alloc_pte = kernel_mapping_pa_to_va(alloc_pte_fixmap); + pt_ops.get_pte_virt = kernel_mapping_pa_to_va(get_pte_virt_fixmap); +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = kernel_mapping_pa_to_va(alloc_pmd_fixmap); + pt_ops.get_pmd_virt = kernel_mapping_pa_to_va(get_pmd_virt_fixmap); + pt_ops.alloc_pud = kernel_mapping_pa_to_va(alloc_pud_fixmap); + pt_ops.get_pud_virt = kernel_mapping_pa_to_va(get_pud_virt_fixmap); + pt_ops.alloc_p4d = kernel_mapping_pa_to_va(alloc_p4d_fixmap); + pt_ops.get_p4d_virt = kernel_mapping_pa_to_va(get_p4d_virt_fixmap); +#endif +} + +/* + * MMU is enabled and page table setup is complete, so from now, we can use + * generic page allocation functions to setup page table. + */ +static void __init pt_ops_set_late(void) +{ + pt_ops.alloc_pte = alloc_pte_late; + pt_ops.get_pte_virt = get_pte_virt_late; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_late; + pt_ops.get_pmd_virt = get_pmd_virt_late; + pt_ops.alloc_pud = alloc_pud_late; + pt_ops.get_pud_virt = get_pud_virt_late; + pt_ops.alloc_p4d = alloc_p4d_late; + pt_ops.get_p4d_virt = get_p4d_virt_late; +#endif +} + +#ifdef CONFIG_RANDOMIZE_BASE +extern bool __init __pi_set_nokaslr_from_cmdline(uintptr_t dtb_pa); +extern u64 __init __pi_get_kaslr_seed(uintptr_t dtb_pa); + +static int __init print_nokaslr(char *p) +{ + pr_info("Disabled KASLR"); + return 0; +} +early_param("nokaslr", print_nokaslr); + +unsigned long kaslr_offset(void) +{ + return kernel_map.virt_offset; +} +#endif + +asmlinkage void __init setup_vm(uintptr_t dtb_pa) +{ + pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; + +#ifdef CONFIG_RANDOMIZE_BASE + if (!__pi_set_nokaslr_from_cmdline(dtb_pa)) { + u64 kaslr_seed = __pi_get_kaslr_seed(dtb_pa); + u32 kernel_size = (uintptr_t)(&_end) - (uintptr_t)(&_start); + u32 nr_pos; + + /* + * Compute the number of positions available: we are limited + * by the early page table that only has one PUD and we must + * be aligned on PMD_SIZE. + */ + nr_pos = (PUD_SIZE - kernel_size) / PMD_SIZE; + + kernel_map.virt_offset = (kaslr_seed % nr_pos) * PMD_SIZE; + } +#endif + + kernel_map.virt_addr = KERNEL_LINK_ADDR + kernel_map.virt_offset; + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); + +#ifdef CONFIG_XIP_KERNEL + kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; + kernel_map.xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom); + + phys_ram_base = CONFIG_PHYS_RAM_BASE; + kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE; + kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_sdata); + + kernel_map.va_kernel_xip_pa_offset = kernel_map.virt_addr - kernel_map.xiprom; +#else + kernel_map.phys_addr = (uintptr_t)(&_start); + kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; +#endif + +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) + set_satp_mode(dtb_pa); +#endif + + /* + * In 64-bit, we defer the setup of va_pa_offset to setup_bootmem, + * where we have the system memory layout: this allows us to align + * the physical and virtual mappings and then make use of PUD/P4D/PGD + * for the linear mapping. This is only possible because the kernel + * mapping lies outside the linear mapping. + * In 32-bit however, as the kernel resides in the linear mapping, + * setup_vm_final can not change the mapping established here, + * otherwise the same kernel addresses would get mapped to different + * physical addresses (if the start of dram is different from the + * kernel physical address start). + */ + kernel_map.va_pa_offset = IS_ENABLED(CONFIG_64BIT) ? + 0UL : PAGE_OFFSET - kernel_map.phys_addr; + kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; + + /* + * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit + * kernel, whereas for 64-bit kernel, the end of the virtual address + * space is occupied by the modules/BPF/kernel mappings which reduces + * the available size of the linear mapping. + */ + memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0); + + /* Sanity check alignment and size */ + BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); + BUG_ON((kernel_map.phys_addr % PMD_SIZE) != 0); + +#ifdef CONFIG_64BIT + /* + * The last 4K bytes of the addressable memory can not be mapped because + * of IS_ERR_VALUE macro. + */ + BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K); +#endif + +#ifdef CONFIG_RELOCATABLE + /* + * Early page table uses only one PUD, which makes it possible + * to map PUD_SIZE aligned on PUD_SIZE: if the relocation offset + * makes the kernel cross over a PUD_SIZE boundary, raise a bug + * since a part of the kernel would not get mapped. + */ + BUG_ON(PUD_SIZE - (kernel_map.virt_addr & (PUD_SIZE - 1)) < kernel_map.size); + relocate_kernel(); +#endif + + apply_early_boot_alternatives(); + pt_ops_set_early(); + + /* Setup early PGD for fixmap */ + create_pgd_mapping(early_pg_dir, FIXADDR_START, + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); + +#ifndef __PAGETABLE_PMD_FOLDED + /* Setup fixmap P4D and PUD */ + if (pgtable_l5_enabled) + create_p4d_mapping(fixmap_p4d, FIXADDR_START, + (uintptr_t)fixmap_pud, P4D_SIZE, PAGE_TABLE); + /* Setup fixmap PUD and PMD */ + if (pgtable_l4_enabled) + create_pud_mapping(fixmap_pud, FIXADDR_START, + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); + create_pmd_mapping(fixmap_pmd, FIXADDR_START, + (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); + /* Setup trampoline PGD and PMD */ + create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); + if (pgtable_l5_enabled) + create_p4d_mapping(trampoline_p4d, kernel_map.virt_addr, + (uintptr_t)trampoline_pud, P4D_SIZE, PAGE_TABLE); + if (pgtable_l4_enabled) + create_pud_mapping(trampoline_pud, kernel_map.virt_addr, + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); +#ifdef CONFIG_XIP_KERNEL + create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, + kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); +#else + create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, + kernel_map.phys_addr, PMD_SIZE, PAGE_KERNEL_EXEC); +#endif +#else + /* Setup trampoline PGD */ + create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, + kernel_map.phys_addr, PGDIR_SIZE, PAGE_KERNEL_EXEC); +#endif + + /* + * Setup early PGD covering entire kernel which will allow + * us to reach paging_init(). We map all memory banks later + * in setup_vm_final() below. + */ + create_kernel_page_table(early_pg_dir, true); + + /* Setup early mapping for FDT early scan */ + create_fdt_early_page_table(__fix_to_virt(FIX_FDT), dtb_pa); + + /* + * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap + * range can not span multiple pmds. + */ + BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); + +#ifndef __PAGETABLE_PMD_FOLDED + /* + * Early ioremap fixmap is already created as it lies within first 2MB + * of fixmap region. We always map PMD_SIZE. Thus, both FIX_BTMAP_END + * FIX_BTMAP_BEGIN should lie in the same pmd. Verify that and warn + * the user if not. + */ + fix_bmap_spmd = fixmap_pmd[pmd_index(__fix_to_virt(FIX_BTMAP_BEGIN))]; + fix_bmap_epmd = fixmap_pmd[pmd_index(__fix_to_virt(FIX_BTMAP_END))]; + if (pmd_val(fix_bmap_spmd) != pmd_val(fix_bmap_epmd)) { + WARN_ON(1); + pr_warn("fixmap btmap start [%08lx] != end [%08lx]\n", + pmd_val(fix_bmap_spmd), pmd_val(fix_bmap_epmd)); + pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN); + } +#endif + + pt_ops_set_fixmap(); +} + +static void __init create_linear_mapping_range(phys_addr_t start, + phys_addr_t end, + uintptr_t fixed_map_size) +{ + phys_addr_t pa; + uintptr_t va, map_size; + + for (pa = start; pa < end; pa += map_size) { + va = (uintptr_t)__va(pa); + map_size = fixed_map_size ? fixed_map_size : + best_map_size(pa, va, end - pa); + + create_pgd_mapping(swapper_pg_dir, va, pa, map_size, + pgprot_from_va(va)); + } +} + +static void __init create_linear_mapping_page_table(void) +{ + phys_addr_t start, end; + phys_addr_t kfence_pool __maybe_unused; + u64 i; + +#ifdef CONFIG_STRICT_KERNEL_RWX + phys_addr_t ktext_start = __pa_symbol(_start); + phys_addr_t ktext_size = __init_data_begin - _start; + phys_addr_t krodata_start = __pa_symbol(__start_rodata); + phys_addr_t krodata_size = _data - __start_rodata; + + /* Isolate kernel text and rodata so they don't get mapped with a PUD */ + memblock_mark_nomap(ktext_start, ktext_size); + memblock_mark_nomap(krodata_start, krodata_size); +#endif + +#ifdef CONFIG_KFENCE + /* + * kfence pool must be backed by PAGE_SIZE mappings, so allocate it + * before we setup the linear mapping so that we avoid using hugepages + * for this region. + */ + kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + BUG_ON(!kfence_pool); + + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = __va(kfence_pool); +#endif + + /* Map all memory banks in the linear mapping */ + for_each_mem_range(i, &start, &end) { + if (start >= end) + break; + if (start <= __pa(PAGE_OFFSET) && + __pa(PAGE_OFFSET) < end) + start = __pa(PAGE_OFFSET); + if (end >= __pa(PAGE_OFFSET) + memory_limit) + end = __pa(PAGE_OFFSET) + memory_limit; + + create_linear_mapping_range(start, end, 0); + } + +#ifdef CONFIG_STRICT_KERNEL_RWX + create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0); + create_linear_mapping_range(krodata_start, + krodata_start + krodata_size, 0); + + memblock_clear_nomap(ktext_start, ktext_size); + memblock_clear_nomap(krodata_start, krodata_size); +#endif + +#ifdef CONFIG_KFENCE + create_linear_mapping_range(kfence_pool, + kfence_pool + KFENCE_POOL_SIZE, + PAGE_SIZE); + + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); +#endif +} + +static void __init setup_vm_final(void) +{ + /* Setup swapper PGD for fixmap */ +#if !defined(CONFIG_64BIT) + /* + * In 32-bit, the device tree lies in a pgd entry, so it must be copied + * directly in swapper_pg_dir in addition to the pgd entry that points + * to fixmap_pte. + */ + unsigned long idx = pgd_index(__fix_to_virt(FIX_FDT)); + + set_pgd(&swapper_pg_dir[idx], early_pg_dir[idx]); +#endif + create_pgd_mapping(swapper_pg_dir, FIXADDR_START, + __pa_symbol(fixmap_pgd_next), + PGDIR_SIZE, PAGE_TABLE); + + /* Map the linear mapping */ + create_linear_mapping_page_table(); + + /* Map the kernel */ + if (IS_ENABLED(CONFIG_64BIT)) + create_kernel_page_table(swapper_pg_dir, false); + +#ifdef CONFIG_KASAN + kasan_swapper_init(); +#endif + + /* Clear fixmap PTE and PMD mappings */ + clear_fixmap(FIX_PTE); + clear_fixmap(FIX_PMD); + clear_fixmap(FIX_PUD); + clear_fixmap(FIX_P4D); + + /* Move to swapper page table */ + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); + local_flush_tlb_all(); + + pt_ops_set_late(); +} +#else +asmlinkage void __init setup_vm(uintptr_t dtb_pa) +{ + dtb_early_va = (void *)dtb_pa; + dtb_early_pa = dtb_pa; +} + +static inline void setup_vm_final(void) +{ +} +#endif /* CONFIG_MMU */ + +/* Reserve 128M low memory by default for swiotlb buffer */ +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) + +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, PMD_SIZE, 0, dma32_phys_limit); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + + return 0; +} + +/* + * reserve_crashkernel() - reserves memory for crash kernel + * + * This function reserves memory area given in "crashkernel=" kernel command + * line parameter. The memory reserved is used by dump capture kernel when + * primary kernel is crashing. + */ +static void __init reserve_crashkernel(void) +{ + unsigned long long crash_base = 0; + unsigned long long crash_size = 0; + unsigned long long crash_low_size = 0; + unsigned long search_start = memblock_start_of_DRAM(); + unsigned long search_end = (unsigned long)dma32_phys_limit; + char *cmdline = boot_command_line; + bool fixed_base = false; + bool high = false; + + int ret = 0; + + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + /* + * Don't reserve a region for a crash kernel on a crash kernel + * since it doesn't make much sense and we have limited memory + * resources. + */ + if (is_kdump_kernel()) { + pr_info("crashkernel: ignoring reservation request\n"); + return; + } + + ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), + &crash_size, &crash_base); + if (ret == -ENOENT) { + /* Fallback to crashkernel=X,[high,low] */ + ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base); + if (ret || !crash_size) + return; + + /* + * crashkernel=Y,low is valid only when crashkernel=X,high + * is passed. + */ + ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base); + if (ret == -ENOENT) + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + else if (ret) + return; + + search_start = (unsigned long)dma32_phys_limit; + search_end = memblock_end_of_DRAM(); + high = true; + } else if (ret || !crash_size) { + /* Invalid argument value specified */ + return; + } + + crash_size = PAGE_ALIGN(crash_size); + + if (crash_base) { + fixed_base = true; + search_start = crash_base; + search_end = crash_base + crash_size; + } + + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 (hugepage size) + * + * Try to alloc from 32bit addressible physical memory so that + * swiotlb can work on the crash kernel. + */ + crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, + search_start, search_end); + if (crash_base == 0) { + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. + */ + if (fixed_base) { + pr_warn("crashkernel: allocating failed with given size@offset\n"); + return; + } + + if (high) { + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + search_start = memblock_start_of_DRAM(); + search_end = (unsigned long)dma32_phys_limit; + } else { + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. + */ + search_start = (unsigned long)dma32_phys_limit; + search_end = memblock_end_of_DRAM(); + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + } + + crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, + search_start, search_end); + if (crash_base == 0) { + pr_warn("crashkernel: couldn't allocate %lldKB\n", + crash_size >> 10); + return; + } + } + + if ((crash_base >= dma32_phys_limit) && crash_low_size && + reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + + pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +void __init paging_init(void) +{ + setup_bootmem(); + setup_vm_final(); + + /* Depend on that Linear Mapping is ready */ + memblock_allow_resize(); +} + +void __init misc_mem_init(void) +{ + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); + arch_numa_init(); + sparse_init(); + zone_sizes_init(); + reserve_crashkernel(); + memblock_dump_all(); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + return vmemmap_populate_basepages(start, end, node, NULL); +} +#endif + +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +/* + * Pre-allocates page-table pages for a specific area in the kernel + * page-table. Only the level which needs to be synchronized between + * all page-tables is allocated because the synchronization can be + * expensive. + */ +static void __init preallocate_pgd_pages_range(unsigned long start, unsigned long end, + const char *area) +{ + unsigned long addr; + const char *lvl; + + for (addr = start; addr < end && addr >= start; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + lvl = "p4d"; + p4d = p4d_alloc(&init_mm, pgd, addr); + if (!p4d) + goto failed; + + if (pgtable_l5_enabled) + continue; + + lvl = "pud"; + pud = pud_alloc(&init_mm, p4d, addr); + if (!pud) + goto failed; + + if (pgtable_l4_enabled) + continue; + + lvl = "pmd"; + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + goto failed; + } + return; + +failed: + /* + * The pages have to be there now or they will be missing in + * process page-tables later. + */ + panic("Failed to pre-allocate %s pages for %s area\n", lvl, area); +} + +void __init pgtable_cache_init(void) +{ + preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc"); + if (IS_ENABLED(CONFIG_MODULES)) + preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, "bpf/modules"); +} +#endif diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c new file mode 100644 index 0000000000..5e39dcf23f --- /dev/null +++ b/arch/riscv/mm/kasan_init.c @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2019 Andes Technology Corporation + +#include <linux/pfn.h> +#include <linux/init_task.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/pgalloc.h> + +/* + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 + * which is right before the kernel. + * + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate + * the page global directory with kasan_early_shadow_pmd. + * + * For sv48 and sv57, the region start is aligned on PGDIR_SIZE whereas the end + * region is not and then we have to go down to the PUD level. + */ + +static pgd_t tmp_pg_dir[PTRS_PER_PGD] __page_aligned_bss; +static p4d_t tmp_p4d[PTRS_PER_P4D] __page_aligned_bss; +static pud_t tmp_pud[PTRS_PER_PUD] __page_aligned_bss; + +static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) +{ + phys_addr_t phys_addr; + pte_t *ptep, *p; + + if (pmd_none(*pmd)) { + p = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); + set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(p)), PAGE_TABLE)); + } + + ptep = pte_offset_kernel(pmd, vaddr); + + do { + if (pte_none(*ptep)) { + phys_addr = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); + set_pte(ptep, pfn_pte(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, PAGE_SIZE); + } + } while (ptep++, vaddr += PAGE_SIZE, vaddr != end); +} + +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) +{ + phys_addr_t phys_addr; + pmd_t *pmdp, *p; + unsigned long next; + + if (pud_none(*pud)) { + p = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + set_pud(pud, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); + } + + pmdp = pmd_offset(pud, vaddr); + + do { + next = pmd_addr_end(vaddr, end); + + if (pmd_none(*pmdp) && IS_ALIGNED(vaddr, PMD_SIZE) && (next - vaddr) >= PMD_SIZE) { + phys_addr = memblock_phys_alloc(PMD_SIZE, PMD_SIZE); + if (phys_addr) { + set_pmd(pmdp, pfn_pmd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, PMD_SIZE); + continue; + } + } + + kasan_populate_pte(pmdp, vaddr, next); + } while (pmdp++, vaddr = next, vaddr != end); +} + +static void __init kasan_populate_pud(p4d_t *p4d, + unsigned long vaddr, unsigned long end) +{ + phys_addr_t phys_addr; + pud_t *pudp, *p; + unsigned long next; + + if (p4d_none(*p4d)) { + p = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); + } + + pudp = pud_offset(p4d, vaddr); + + do { + next = pud_addr_end(vaddr, end); + + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); + if (phys_addr) { + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, PUD_SIZE); + continue; + } + } + + kasan_populate_pmd(pudp, vaddr, next); + } while (pudp++, vaddr = next, vaddr != end); +} + +static void __init kasan_populate_p4d(pgd_t *pgd, + unsigned long vaddr, unsigned long end) +{ + phys_addr_t phys_addr; + p4d_t *p4dp, *p; + unsigned long next; + + if (pgd_none(*pgd)) { + p = memblock_alloc(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); + } + + p4dp = p4d_offset(pgd, vaddr); + + do { + next = p4d_addr_end(vaddr, end); + + if (p4d_none(*p4dp) && IS_ALIGNED(vaddr, P4D_SIZE) && (next - vaddr) >= P4D_SIZE) { + phys_addr = memblock_phys_alloc(P4D_SIZE, P4D_SIZE); + if (phys_addr) { + set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, P4D_SIZE); + continue; + } + } + + kasan_populate_pud(p4dp, vaddr, next); + } while (p4dp++, vaddr = next, vaddr != end); +} + +static void __init kasan_populate_pgd(pgd_t *pgdp, + unsigned long vaddr, unsigned long end) +{ + phys_addr_t phys_addr; + unsigned long next; + + do { + next = pgd_addr_end(vaddr, end); + + if (pgd_none(*pgdp) && IS_ALIGNED(vaddr, PGDIR_SIZE) && + (next - vaddr) >= PGDIR_SIZE) { + phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); + if (phys_addr) { + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + memset(__va(phys_addr), KASAN_SHADOW_INIT, PGDIR_SIZE); + continue; + } + } + + kasan_populate_p4d(pgdp, vaddr, next); + } while (pgdp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_clear_pud(p4d_t *p4dp, + unsigned long vaddr, unsigned long end) +{ + pud_t *pudp, *base_pud; + unsigned long next; + + if (!pgtable_l4_enabled) { + pudp = (pud_t *)p4dp; + } else { + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_p4d_pfn(*p4dp))); + pudp = base_pud + pud_index(vaddr); + } + + do { + next = pud_addr_end(vaddr, end); + + if (IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { + pud_clear(pudp); + continue; + } + + BUG(); + } while (pudp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_clear_p4d(pgd_t *pgdp, + unsigned long vaddr, unsigned long end) +{ + p4d_t *p4dp, *base_p4d; + unsigned long next; + + if (!pgtable_l5_enabled) { + p4dp = (p4d_t *)pgdp; + } else { + base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(*pgdp))); + p4dp = base_p4d + p4d_index(vaddr); + } + + do { + next = p4d_addr_end(vaddr, end); + + if (pgtable_l4_enabled && IS_ALIGNED(vaddr, P4D_SIZE) && + (next - vaddr) >= P4D_SIZE) { + p4d_clear(p4dp); + continue; + } + + kasan_early_clear_pud(p4dp, vaddr, next); + } while (p4dp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_clear_pgd(pgd_t *pgdp, + unsigned long vaddr, unsigned long end) +{ + unsigned long next; + + do { + next = pgd_addr_end(vaddr, end); + + if (pgtable_l5_enabled && IS_ALIGNED(vaddr, PGDIR_SIZE) && + (next - vaddr) >= PGDIR_SIZE) { + pgd_clear(pgdp); + continue; + } + + kasan_early_clear_p4d(pgdp, vaddr, next); + } while (pgdp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_populate_pud(p4d_t *p4dp, + unsigned long vaddr, + unsigned long end) +{ + pud_t *pudp, *base_pud; + phys_addr_t phys_addr; + unsigned long next; + + if (!pgtable_l4_enabled) { + pudp = (pud_t *)p4dp; + } else { + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_p4d_pfn(*p4dp))); + pudp = base_pud + pud_index(vaddr); + } + + do { + next = pud_addr_end(vaddr, end); + + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && + (next - vaddr) >= PUD_SIZE) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_pmd); + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } + + BUG(); + } while (pudp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_populate_p4d(pgd_t *pgdp, + unsigned long vaddr, + unsigned long end) +{ + p4d_t *p4dp, *base_p4d; + phys_addr_t phys_addr; + unsigned long next; + + /* + * We can't use pgd_page_vaddr here as it would return a linear + * mapping address but it is not mapped yet, but when populating + * early_pg_dir, we need the physical address and when populating + * swapper_pg_dir, we need the kernel virtual address so use + * pt_ops facility. + * Note that this test is then completely equivalent to + * p4dp = p4d_offset(pgdp, vaddr) + */ + if (!pgtable_l5_enabled) { + p4dp = (p4d_t *)pgdp; + } else { + base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(*pgdp))); + p4dp = base_p4d + p4d_index(vaddr); + } + + do { + next = p4d_addr_end(vaddr, end); + + if (p4d_none(*p4dp) && IS_ALIGNED(vaddr, P4D_SIZE) && + (next - vaddr) >= P4D_SIZE) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_pud); + set_p4d(p4dp, pfn_p4d(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } + + kasan_early_populate_pud(p4dp, vaddr, next); + } while (p4dp++, vaddr = next, vaddr != end); +} + +static void __init kasan_early_populate_pgd(pgd_t *pgdp, + unsigned long vaddr, + unsigned long end) +{ + phys_addr_t phys_addr; + unsigned long next; + + do { + next = pgd_addr_end(vaddr, end); + + if (pgd_none(*pgdp) && IS_ALIGNED(vaddr, PGDIR_SIZE) && + (next - vaddr) >= PGDIR_SIZE) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_p4d); + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } + + kasan_early_populate_p4d(pgdp, vaddr, next); + } while (pgdp++, vaddr = next, vaddr != end); +} + +asmlinkage void __init kasan_early_init(void) +{ + uintptr_t i; + + BUILD_BUG_ON(KASAN_SHADOW_OFFSET != + KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); + + for (i = 0; i < PTRS_PER_PTE; ++i) + set_pte(kasan_early_shadow_pte + i, + pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL)); + + for (i = 0; i < PTRS_PER_PMD; ++i) + set_pmd(kasan_early_shadow_pmd + i, + pfn_pmd(PFN_DOWN + (__pa((uintptr_t)kasan_early_shadow_pte)), + PAGE_TABLE)); + + if (pgtable_l4_enabled) { + for (i = 0; i < PTRS_PER_PUD; ++i) + set_pud(kasan_early_shadow_pud + i, + pfn_pud(PFN_DOWN + (__pa(((uintptr_t)kasan_early_shadow_pmd))), + PAGE_TABLE)); + } + + if (pgtable_l5_enabled) { + for (i = 0; i < PTRS_PER_P4D; ++i) + set_p4d(kasan_early_shadow_p4d + i, + pfn_p4d(PFN_DOWN + (__pa(((uintptr_t)kasan_early_shadow_pud))), + PAGE_TABLE)); + } + + kasan_early_populate_pgd(early_pg_dir + pgd_index(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END); + + local_flush_tlb_all(); +} + +void __init kasan_swapper_init(void) +{ + kasan_early_populate_pgd(pgd_offset_k(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END); + + local_flush_tlb_all(); +} + +static void __init kasan_populate(void *start, void *end) +{ + unsigned long vaddr = (unsigned long)start & PAGE_MASK; + unsigned long vend = PAGE_ALIGN((unsigned long)end); + + kasan_populate_pgd(pgd_offset_k(vaddr), vaddr, vend); +} + +static void __init kasan_shallow_populate_pud(p4d_t *p4d, + unsigned long vaddr, unsigned long end) +{ + unsigned long next; + void *p; + pud_t *pud_k = pud_offset(p4d, vaddr); + + do { + next = pud_addr_end(vaddr, end); + + if (pud_none(*pud_k)) { + p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pud(pud_k, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); + continue; + } + + BUG(); + } while (pud_k++, vaddr = next, vaddr != end); +} + +static void __init kasan_shallow_populate_p4d(pgd_t *pgd, + unsigned long vaddr, unsigned long end) +{ + unsigned long next; + void *p; + p4d_t *p4d_k = p4d_offset(pgd, vaddr); + + do { + next = p4d_addr_end(vaddr, end); + + if (p4d_none(*p4d_k)) { + p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_p4d(p4d_k, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); + continue; + } + + kasan_shallow_populate_pud(p4d_k, vaddr, end); + } while (p4d_k++, vaddr = next, vaddr != end); +} + +static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) +{ + unsigned long next; + void *p; + pgd_t *pgd_k = pgd_offset_k(vaddr); + + do { + next = pgd_addr_end(vaddr, end); + + if (pgd_none(*pgd_k)) { + p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); + continue; + } + + kasan_shallow_populate_p4d(pgd_k, vaddr, next); + } while (pgd_k++, vaddr = next, vaddr != end); +} + +static void __init kasan_shallow_populate(void *start, void *end) +{ + unsigned long vaddr = (unsigned long)start & PAGE_MASK; + unsigned long vend = PAGE_ALIGN((unsigned long)end); + + kasan_shallow_populate_pgd(vaddr, vend); +} + +static void __init create_tmp_mapping(void) +{ + void *ptr; + p4d_t *base_p4d; + + /* + * We need to clean the early mapping: this is hard to achieve "in-place", + * so install a temporary mapping like arm64 and x86 do. + */ + memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(pgd_t) * PTRS_PER_PGD); + + /* Copy the last p4d since it is shared with the kernel mapping. */ + if (pgtable_l5_enabled) { + ptr = (p4d_t *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END)); + memcpy(tmp_p4d, ptr, sizeof(p4d_t) * PTRS_PER_P4D); + set_pgd(&tmp_pg_dir[pgd_index(KASAN_SHADOW_END)], + pfn_pgd(PFN_DOWN(__pa(tmp_p4d)), PAGE_TABLE)); + base_p4d = tmp_p4d; + } else { + base_p4d = (p4d_t *)tmp_pg_dir; + } + + /* Copy the last pud since it is shared with the kernel mapping. */ + if (pgtable_l4_enabled) { + ptr = (pud_t *)p4d_page_vaddr(*(base_p4d + p4d_index(KASAN_SHADOW_END))); + memcpy(tmp_pud, ptr, sizeof(pud_t) * PTRS_PER_PUD); + set_p4d(&base_p4d[p4d_index(KASAN_SHADOW_END)], + pfn_p4d(PFN_DOWN(__pa(tmp_pud)), PAGE_TABLE)); + } +} + +void __init kasan_init(void) +{ + phys_addr_t p_start, p_end; + u64 i; + + create_tmp_mapping(); + csr_write(CSR_SATP, PFN_DOWN(__pa(tmp_pg_dir)) | satp_mode); + + kasan_early_clear_pgd(pgd_offset_k(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END); + + kasan_populate_early_shadow((void *)kasan_mem_to_shadow((void *)FIXADDR_START), + (void *)kasan_mem_to_shadow((void *)VMALLOC_START)); + + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + kasan_shallow_populate( + (void *)kasan_mem_to_shadow((void *)VMALLOC_START), + (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); + /* Shallow populate modules and BPF which are vmalloc-allocated */ + kasan_shallow_populate( + (void *)kasan_mem_to_shadow((void *)MODULES_VADDR), + (void *)kasan_mem_to_shadow((void *)MODULES_END)); + } else { + kasan_populate_early_shadow((void *)kasan_mem_to_shadow((void *)VMALLOC_START), + (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); + } + + /* Populate the linear mapping */ + for_each_mem_range(i, &p_start, &p_end) { + void *start = (void *)__va(p_start); + void *end = (void *)__va(p_end); + + if (start >= end) + break; + + kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); + } + + /* Populate kernel */ + kasan_populate(kasan_mem_to_shadow((const void *)MODULES_END), + kasan_mem_to_shadow((const void *)MODULES_VADDR + SZ_2G)); + + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(&kasan_early_shadow_pte[i], + mk_pte(virt_to_page(kasan_early_shadow_page), + __pgprot(_PAGE_PRESENT | _PAGE_READ | + _PAGE_ACCESSED))); + + memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE); + init_task.kasan_depth = 0; + + csr_write(CSR_SATP, PFN_DOWN(__pa(swapper_pg_dir)) | satp_mode); + local_flush_tlb_all(); +} diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c new file mode 100644 index 0000000000..01398fee5c --- /dev/null +++ b/arch/riscv/mm/pageattr.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 SiFive + */ + +#include <linux/pagewalk.h> +#include <linux/pgtable.h> +#include <linux/vmalloc.h> +#include <asm/tlbflush.h> +#include <asm/bitops.h> +#include <asm/set_memory.h> + +struct pageattr_masks { + pgprot_t set_mask; + pgprot_t clear_mask; +}; + +static unsigned long set_pageattr_masks(unsigned long val, struct mm_walk *walk) +{ + struct pageattr_masks *masks = walk->private; + unsigned long new_val = val; + + new_val &= ~(pgprot_val(masks->clear_mask)); + new_val |= (pgprot_val(masks->set_mask)); + + return new_val; +} + +static int pageattr_p4d_entry(p4d_t *p4d, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + p4d_t val = READ_ONCE(*p4d); + + if (p4d_leaf(val)) { + val = __p4d(set_pageattr_masks(p4d_val(val), walk)); + set_p4d(p4d, val); + } + + return 0; +} + +static int pageattr_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t val = READ_ONCE(*pud); + + if (pud_leaf(val)) { + val = __pud(set_pageattr_masks(pud_val(val), walk)); + set_pud(pud, val); + } + + return 0; +} + +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t val = READ_ONCE(*pmd); + + if (pmd_leaf(val)) { + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); + set_pmd(pmd, val); + } + + return 0; +} + +static int pageattr_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t val = READ_ONCE(*pte); + + val = __pte(set_pageattr_masks(pte_val(val), walk)); + set_pte(pte, val); + + return 0; +} + +static int pageattr_pte_hole(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk) +{ + /* Nothing to do here */ + return 0; +} + +static const struct mm_walk_ops pageattr_ops = { + .p4d_entry = pageattr_p4d_entry, + .pud_entry = pageattr_pud_entry, + .pmd_entry = pageattr_pmd_entry, + .pte_entry = pageattr_pte_entry, + .pte_hole = pageattr_pte_hole, + .walk_lock = PGWALK_RDLOCK, +}; + +#ifdef CONFIG_64BIT +static int __split_linear_mapping_pmd(pud_t *pudp, + unsigned long vaddr, unsigned long end) +{ + pmd_t *pmdp; + unsigned long next; + + pmdp = pmd_offset(pudp, vaddr); + + do { + next = pmd_addr_end(vaddr, end); + + if (next - vaddr >= PMD_SIZE && + vaddr <= (vaddr & PMD_MASK) && end >= next) + continue; + + if (pmd_leaf(*pmdp)) { + struct page *pte_page; + unsigned long pfn = _pmd_pfn(*pmdp); + pgprot_t prot = __pgprot(pmd_val(*pmdp) & ~_PAGE_PFN_MASK); + pte_t *ptep_new; + int i; + + pte_page = alloc_page(GFP_KERNEL); + if (!pte_page) + return -ENOMEM; + + ptep_new = (pte_t *)page_address(pte_page); + for (i = 0; i < PTRS_PER_PTE; ++i, ++ptep_new) + set_pte(ptep_new, pfn_pte(pfn + i, prot)); + + smp_wmb(); + + set_pmd(pmdp, pfn_pmd(page_to_pfn(pte_page), PAGE_TABLE)); + } + } while (pmdp++, vaddr = next, vaddr != end); + + return 0; +} + +static int __split_linear_mapping_pud(p4d_t *p4dp, + unsigned long vaddr, unsigned long end) +{ + pud_t *pudp; + unsigned long next; + int ret; + + pudp = pud_offset(p4dp, vaddr); + + do { + next = pud_addr_end(vaddr, end); + + if (next - vaddr >= PUD_SIZE && + vaddr <= (vaddr & PUD_MASK) && end >= next) + continue; + + if (pud_leaf(*pudp)) { + struct page *pmd_page; + unsigned long pfn = _pud_pfn(*pudp); + pgprot_t prot = __pgprot(pud_val(*pudp) & ~_PAGE_PFN_MASK); + pmd_t *pmdp_new; + int i; + + pmd_page = alloc_page(GFP_KERNEL); + if (!pmd_page) + return -ENOMEM; + + pmdp_new = (pmd_t *)page_address(pmd_page); + for (i = 0; i < PTRS_PER_PMD; ++i, ++pmdp_new) + set_pmd(pmdp_new, + pfn_pmd(pfn + ((i * PMD_SIZE) >> PAGE_SHIFT), prot)); + + smp_wmb(); + + set_pud(pudp, pfn_pud(page_to_pfn(pmd_page), PAGE_TABLE)); + } + + ret = __split_linear_mapping_pmd(pudp, vaddr, next); + if (ret) + return ret; + } while (pudp++, vaddr = next, vaddr != end); + + return 0; +} + +static int __split_linear_mapping_p4d(pgd_t *pgdp, + unsigned long vaddr, unsigned long end) +{ + p4d_t *p4dp; + unsigned long next; + int ret; + + p4dp = p4d_offset(pgdp, vaddr); + + do { + next = p4d_addr_end(vaddr, end); + + /* + * If [vaddr; end] contains [vaddr & P4D_MASK; next], we don't + * need to split, we'll change the protections on the whole P4D. + */ + if (next - vaddr >= P4D_SIZE && + vaddr <= (vaddr & P4D_MASK) && end >= next) + continue; + + if (p4d_leaf(*p4dp)) { + struct page *pud_page; + unsigned long pfn = _p4d_pfn(*p4dp); + pgprot_t prot = __pgprot(p4d_val(*p4dp) & ~_PAGE_PFN_MASK); + pud_t *pudp_new; + int i; + + pud_page = alloc_page(GFP_KERNEL); + if (!pud_page) + return -ENOMEM; + + /* + * Fill the pud level with leaf puds that have the same + * protections as the leaf p4d. + */ + pudp_new = (pud_t *)page_address(pud_page); + for (i = 0; i < PTRS_PER_PUD; ++i, ++pudp_new) + set_pud(pudp_new, + pfn_pud(pfn + ((i * PUD_SIZE) >> PAGE_SHIFT), prot)); + + /* + * Make sure the pud filling is not reordered with the + * p4d store which could result in seeing a partially + * filled pud level. + */ + smp_wmb(); + + set_p4d(p4dp, pfn_p4d(page_to_pfn(pud_page), PAGE_TABLE)); + } + + ret = __split_linear_mapping_pud(p4dp, vaddr, next); + if (ret) + return ret; + } while (p4dp++, vaddr = next, vaddr != end); + + return 0; +} + +static int __split_linear_mapping_pgd(pgd_t *pgdp, + unsigned long vaddr, + unsigned long end) +{ + unsigned long next; + int ret; + + do { + next = pgd_addr_end(vaddr, end); + /* We never use PGD mappings for the linear mapping */ + ret = __split_linear_mapping_p4d(pgdp, vaddr, next); + if (ret) + return ret; + } while (pgdp++, vaddr = next, vaddr != end); + + return 0; +} + +static int split_linear_mapping(unsigned long start, unsigned long end) +{ + return __split_linear_mapping_pgd(pgd_offset_k(start), start, end); +} +#endif /* CONFIG_64BIT */ + +static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, + pgprot_t clear_mask) +{ + int ret; + unsigned long start = addr; + unsigned long end = start + PAGE_SIZE * numpages; + unsigned long __maybe_unused lm_start; + unsigned long __maybe_unused lm_end; + struct pageattr_masks masks = { + .set_mask = set_mask, + .clear_mask = clear_mask + }; + + if (!numpages) + return 0; + + mmap_write_lock(&init_mm); + +#ifdef CONFIG_64BIT + /* + * We are about to change the permissions of a kernel mapping, we must + * apply the same changes to its linear mapping alias, which may imply + * splitting a huge mapping. + */ + + if (is_vmalloc_or_module_addr((void *)start)) { + struct vm_struct *area = NULL; + int i, page_start; + + area = find_vm_area((void *)start); + page_start = (start - (unsigned long)area->addr) >> PAGE_SHIFT; + + for (i = page_start; i < page_start + numpages; ++i) { + lm_start = (unsigned long)page_address(area->pages[i]); + lm_end = lm_start + PAGE_SIZE; + + ret = split_linear_mapping(lm_start, lm_end); + if (ret) + goto unlock; + + ret = walk_page_range_novma(&init_mm, lm_start, lm_end, + &pageattr_ops, NULL, &masks); + if (ret) + goto unlock; + } + } else if (is_kernel_mapping(start) || is_linear_mapping(start)) { + if (is_kernel_mapping(start)) { + lm_start = (unsigned long)lm_alias(start); + lm_end = (unsigned long)lm_alias(end); + } else { + lm_start = start; + lm_end = end; + } + + ret = split_linear_mapping(lm_start, lm_end); + if (ret) + goto unlock; + + ret = walk_page_range_novma(&init_mm, lm_start, lm_end, + &pageattr_ops, NULL, &masks); + if (ret) + goto unlock; + } + + ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, + &masks); + +unlock: + mmap_write_unlock(&init_mm); + + /* + * We can't use flush_tlb_kernel_range() here as we may have split a + * hugepage that is larger than that, so let's flush everything. + */ + flush_tlb_all(); +#else + ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, + &masks); + + mmap_write_unlock(&init_mm); + + flush_tlb_kernel_range(start, end); +#endif + + return ret; +} + +int set_memory_rw_nx(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(_PAGE_READ | _PAGE_WRITE), + __pgprot(_PAGE_EXEC)); +} + +int set_memory_ro(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(_PAGE_READ), + __pgprot(_PAGE_WRITE)); +} + +int set_memory_rw(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(_PAGE_READ | _PAGE_WRITE), + __pgprot(0)); +} + +int set_memory_x(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(_PAGE_EXEC), __pgprot(0)); +} + +int set_memory_nx(unsigned long addr, int numpages) +{ + return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC)); +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_address(page), 1, + __pgprot(0), __pgprot(_PAGE_PRESENT)); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_address(page), 1, + PAGE_KERNEL, __pgprot(_PAGE_EXEC)); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (!debug_pagealloc_enabled()) + return; + + if (enable) + __set_memory((unsigned long)page_address(page), numpages, + __pgprot(_PAGE_PRESENT), __pgprot(0)); + else + __set_memory((unsigned long)page_address(page), numpages, + __pgprot(0), __pgprot(_PAGE_PRESENT)); +} +#endif + +bool kernel_page_present(struct page *page) +{ + unsigned long addr = (unsigned long)page_address(page); + pgd_t *pgd; + pud_t *pud; + p4d_t *p4d; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + return false; + if (pgd_leaf(*pgd)) + return true; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return false; + if (p4d_leaf(*p4d)) + return true; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return false; + if (pud_leaf(*pud)) + return true; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return false; + if (pmd_leaf(*pmd)) + return true; + + pte = pte_offset_kernel(pmd, addr); + return pte_present(*pte); +} diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c new file mode 100644 index 0000000000..fef4e7328e --- /dev/null +++ b/arch/riscv/mm/pgtable.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/pgalloc.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/pgtable.h> + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} + +void p4d_clear_huge(p4d_t *p4d) +{ +} + +int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) +{ + pud_t new_pud = pfn_pud(__phys_to_pfn(phys), prot); + + set_pud(pud, new_pud); + return 1; +} + +int pud_clear_huge(pud_t *pud) +{ + if (!pud_leaf(READ_ONCE(*pud))) + return 0; + pud_clear(pud); + return 1; +} + +int pud_free_pmd_page(pud_t *pud, unsigned long addr) +{ + pmd_t *pmd = pud_pgtable(*pud); + int i; + + pud_clear(pud); + + flush_tlb_kernel_range(addr, addr + PUD_SIZE); + + for (i = 0; i < PTRS_PER_PMD; i++) { + if (!pmd_none(pmd[i])) { + pte_t *pte = (pte_t *)pmd_page_vaddr(pmd[i]); + + pte_free_kernel(NULL, pte); + } + } + + pmd_free(NULL, pmd); + + return 1; +} + +int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) +{ + pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), prot); + + set_pmd(pmd, new_pmd); + return 1; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + if (!pmd_leaf(READ_ONCE(*pmd))) + return 0; + pmd_clear(pmd); + return 1; +} + +int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) +{ + pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); + + pmd_clear(pmd); + + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + pte_free_kernel(NULL, pte); + return 1; +} + +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + /* + * When leaf PTE entries (regular pages) are collapsed into a leaf + * PMD entry (huge page), a valid non-leaf PTE is converted into a + * valid leaf PTE at the level 1 page table. Since the sfence.vma + * forms that specify an address only apply to leaf PTEs, we need a + * global flush here. collapse_huge_page() assumes these flushes are + * eager, so just do the fence here. + */ + flush_tlb_mm(vma->vm_mm); + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c new file mode 100644 index 0000000000..18706f457d --- /dev/null +++ b/arch/riscv/mm/physaddr.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/types.h> +#include <linux/mmdebug.h> +#include <linux/mm.h> +#include <asm/page.h> +#include <asm/sections.h> + +phys_addr_t __virt_to_phys(unsigned long x) +{ + /* + * Boundary checking aginst the kernel linear mapping space. + */ + WARN(!is_linear_mapping(x) && !is_kernel_mapping(x), + "virt_to_phys used for non-linear address: %pK (%pS)\n", + (void *)x, (void *)x); + + return __va_to_pa_nodebug(x); +} +EXPORT_SYMBOL(__virt_to_phys); + +phys_addr_t __phys_addr_symbol(unsigned long x) +{ + unsigned long kernel_start = kernel_map.virt_addr; + unsigned long kernel_end = kernel_start + kernel_map.size; + + /* + * Boundary checking aginst the kernel image mapping. + * __pa_symbol should only be used on kernel symbol addresses. + */ + VIRTUAL_BUG_ON(x < kernel_start || x > kernel_end); + + return __va_to_pa_nodebug(x); +} +EXPORT_SYMBOL(__phys_addr_symbol); + +phys_addr_t linear_mapping_va_to_pa(unsigned long x) +{ + BUG_ON(!kernel_map.va_pa_offset); + + return ((unsigned long)(x) - kernel_map.va_pa_offset); +} +EXPORT_SYMBOL(linear_mapping_va_to_pa); + +void *linear_mapping_pa_to_va(unsigned long x) +{ + BUG_ON(!kernel_map.va_pa_offset); + + return ((void *)((unsigned long)(x) + kernel_map.va_pa_offset)); +} +EXPORT_SYMBOL(linear_mapping_pa_to_va); diff --git a/arch/riscv/mm/pmem.c b/arch/riscv/mm/pmem.c new file mode 100644 index 0000000000..c5fc5ec96f --- /dev/null +++ b/arch/riscv/mm/pmem.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022 Ventana Micro Systems Inc. + */ + +#include <linux/export.h> +#include <linux/libnvdimm.h> + +#include <asm/cacheflush.h> +#include <asm/dma-noncoherent.h> + +void arch_wb_cache_pmem(void *addr, size_t size) +{ +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.wback)) { + noncoherent_cache_ops.wback(virt_to_phys(addr), size); + return; + } +#endif + ALT_CMO_OP(clean, addr, size, riscv_cbom_block_size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ +#ifdef CONFIG_RISCV_NONSTANDARD_CACHE_OPS + if (unlikely(noncoherent_cache_ops.inv)) { + noncoherent_cache_ops.inv(virt_to_phys(addr), size); + return; + } +#endif + ALT_CMO_OP(inval, addr, size, riscv_cbom_block_size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c new file mode 100644 index 0000000000..e9090b38f8 --- /dev/null +++ b/arch/riscv/mm/ptdump.c @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 SiFive + */ + +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/ptdump.h> + +#include <asm/ptdump.h> +#include <linux/pgtable.h> +#include <asm/kasan.h> + +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + if (m) \ + seq_printf(m, fmt); \ +}) + +/* + * The page dumper groups page table entries of the same type into a single + * description. It uses pg_state to track the range information while + * iterating over the pte entries. When the continuity is broken it then + * dumps out a description of the range. + */ +struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; + const struct addr_marker *marker; + unsigned long start_address; + unsigned long start_pa; + unsigned long last_pa; + int level; + u64 current_prot; + bool check_wx; + unsigned long wx_pages; +}; + +/* Address marker */ +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +/* Private information for debugfs */ +struct ptd_mm_info { + struct mm_struct *mm; + const struct addr_marker *markers; + unsigned long base_addr; + unsigned long end; +}; + +enum address_markers_idx { + FIXMAP_START_NR, + FIXMAP_END_NR, + PCI_IO_START_NR, + PCI_IO_END_NR, +#ifdef CONFIG_SPARSEMEM_VMEMMAP + VMEMMAP_START_NR, + VMEMMAP_END_NR, +#endif + VMALLOC_START_NR, + VMALLOC_END_NR, + PAGE_OFFSET_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif +#ifdef CONFIG_64BIT + MODULES_MAPPING_NR, + KERNEL_MAPPING_NR, +#endif + END_OF_SPACE_NR +}; + +static struct addr_marker address_markers[] = { + {0, "Fixmap start"}, + {0, "Fixmap end"}, + {0, "PCI I/O start"}, + {0, "PCI I/O end"}, +#ifdef CONFIG_SPARSEMEM_VMEMMAP + {0, "vmemmap start"}, + {0, "vmemmap end"}, +#endif + {0, "vmalloc() area"}, + {0, "vmalloc() end"}, + {0, "Linear mapping"}, +#ifdef CONFIG_KASAN + {0, "Kasan shadow start"}, + {0, "Kasan shadow end"}, +#endif +#ifdef CONFIG_64BIT + {0, "Modules/BPF mapping"}, + {0, "Kernel mapping"}, +#endif + {-1, NULL}, +}; + +static struct ptd_mm_info kernel_ptd_info = { + .mm = &init_mm, + .markers = address_markers, + .base_addr = 0, + .end = ULONG_MAX, +}; + +#ifdef CONFIG_EFI +static struct addr_marker efi_addr_markers[] = { + { 0, "UEFI runtime start" }, + { SZ_1G, "UEFI runtime end" }, + { -1, NULL } +}; + +static struct ptd_mm_info efi_ptd_info = { + .mm = &efi_mm, + .markers = efi_addr_markers, + .base_addr = 0, + .end = SZ_2G, +}; +#endif + +/* Page Table Entry */ +struct prot_bits { + u64 mask; + u64 val; + const char *set; + const char *clear; +}; + +static const struct prot_bits pte_bits[] = { + { + .mask = _PAGE_SOFT, + .val = _PAGE_SOFT, + .set = "RSW", + .clear = " ", + }, { + .mask = _PAGE_DIRTY, + .val = _PAGE_DIRTY, + .set = "D", + .clear = ".", + }, { + .mask = _PAGE_ACCESSED, + .val = _PAGE_ACCESSED, + .set = "A", + .clear = ".", + }, { + .mask = _PAGE_GLOBAL, + .val = _PAGE_GLOBAL, + .set = "G", + .clear = ".", + }, { + .mask = _PAGE_USER, + .val = _PAGE_USER, + .set = "U", + .clear = ".", + }, { + .mask = _PAGE_EXEC, + .val = _PAGE_EXEC, + .set = "X", + .clear = ".", + }, { + .mask = _PAGE_WRITE, + .val = _PAGE_WRITE, + .set = "W", + .clear = ".", + }, { + .mask = _PAGE_READ, + .val = _PAGE_READ, + .set = "R", + .clear = ".", + }, { + .mask = _PAGE_PRESENT, + .val = _PAGE_PRESENT, + .set = "V", + .clear = ".", + } +}; + +/* Page Level */ +struct pg_level { + const char *name; + u64 mask; +}; + +static struct pg_level pg_level[] = { + { /* pgd */ + .name = "PGD", + }, { /* p4d */ + .name = (CONFIG_PGTABLE_LEVELS > 4) ? "P4D" : "PGD", + }, { /* pud */ + .name = (CONFIG_PGTABLE_LEVELS > 3) ? "PUD" : "PGD", + }, { /* pmd */ + .name = (CONFIG_PGTABLE_LEVELS > 2) ? "PMD" : "PGD", + }, { /* pte */ + .name = "PTE", + }, +}; + +static void dump_prot(struct pg_state *st) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(pte_bits); i++) { + const char *s; + + if ((st->current_prot & pte_bits[i].mask) == pte_bits[i].val) + s = pte_bits[i].set; + else + s = pte_bits[i].clear; + + if (s) + pt_dump_seq_printf(st->seq, " %s", s); + } +} + +#ifdef CONFIG_64BIT +#define ADDR_FORMAT "0x%016lx" +#else +#define ADDR_FORMAT "0x%08lx" +#endif +static void dump_addr(struct pg_state *st, unsigned long addr) +{ + static const char units[] = "KMGTPE"; + const char *unit = units; + unsigned long delta; + + pt_dump_seq_printf(st->seq, ADDR_FORMAT "-" ADDR_FORMAT " ", + st->start_address, addr); + + pt_dump_seq_printf(st->seq, " " ADDR_FORMAT " ", st->start_pa); + delta = (addr - st->start_address) >> 10; + + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + + pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit, + pg_level[st->level].name); +} + +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + + if ((st->current_prot & (_PAGE_WRITE | _PAGE_EXEC)) != + (_PAGE_WRITE | _PAGE_EXEC)) + return; + + WARN_ONCE(1, "riscv/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, + int level, u64 val) +{ + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + u64 pa = PFN_PHYS(pte_pfn(__pte(val))); + u64 prot = 0; + + if (level >= 0) + prot = val & pg_level[level].mask; + + if (st->level == -1) { + st->level = level; + st->current_prot = prot; + st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + } else if (prot != st->current_prot || + level != st->level || addr >= st->marker[1].start_address) { + if (st->current_prot) { + note_prot_wx(st, addr); + dump_addr(st, addr); + dump_prot(st); + pt_dump_seq_puts(st->seq, "\n"); + } + + while (addr >= st->marker[1].start_address) { + st->marker++; + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", + st->marker->name); + } + + st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; + st->current_prot = prot; + st->level = level; + } else { + st->last_pa = pa; + } +} + +static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo) +{ + struct pg_state st = { + .seq = s, + .marker = pinfo->markers, + .level = -1, + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {pinfo->base_addr, pinfo->end}, + {0, 0} + } + } + }; + + ptdump_walk_pgd(&st.ptdump, pinfo->mm, NULL); +} + +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = (struct addr_marker[]) { + {0, NULL}, + {-1, NULL}, + }, + .level = -1, + .check_wx = true, + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {KERN_VIRT_START, ULONG_MAX}, + {0, 0} + } + } + }; + + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + + if (st.wx_pages) + pr_warn("Checked W+X mappings: failed, %lu W+X pages found\n", + st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + ptdump_walk(m, m->private); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(ptdump); + +static int __init ptdump_init(void) +{ + unsigned int i, j; + + address_markers[FIXMAP_START_NR].start_address = FIXADDR_START; + address_markers[FIXMAP_END_NR].start_address = FIXADDR_TOP; + address_markers[PCI_IO_START_NR].start_address = PCI_IO_START; + address_markers[PCI_IO_END_NR].start_address = PCI_IO_END; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; + address_markers[VMEMMAP_END_NR].start_address = VMEMMAP_END; +#endif + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; + address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET; +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif +#ifdef CONFIG_64BIT + address_markers[MODULES_MAPPING_NR].start_address = MODULES_VADDR; + address_markers[KERNEL_MAPPING_NR].start_address = kernel_map.virt_addr; +#endif + + kernel_ptd_info.base_addr = KERN_VIRT_START; + + pg_level[1].name = pgtable_l5_enabled ? "P4D" : "PGD"; + pg_level[2].name = pgtable_l4_enabled ? "PUD" : "PGD"; + + for (i = 0; i < ARRAY_SIZE(pg_level); i++) + for (j = 0; j < ARRAY_SIZE(pte_bits); j++) + pg_level[i].mask |= pte_bits[j].mask; + + debugfs_create_file("kernel_page_tables", 0400, NULL, &kernel_ptd_info, + &ptdump_fops); +#ifdef CONFIG_EFI + if (efi_enabled(EFI_RUNTIME_SERVICES)) + debugfs_create_file("efi_page_tables", 0400, NULL, &efi_ptd_info, + &ptdump_fops); +#endif + + return 0; +} + +device_initcall(ptdump_init); diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c new file mode 100644 index 0000000000..77be59aadc --- /dev/null +++ b/arch/riscv/mm/tlbflush.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <asm/sbi.h> +#include <asm/mmu_context.h> + +static inline void local_flush_tlb_all_asid(unsigned long asid) +{ + __asm__ __volatile__ ("sfence.vma x0, %0" + : + : "r" (asid) + : "memory"); +} + +static inline void local_flush_tlb_page_asid(unsigned long addr, + unsigned long asid) +{ + __asm__ __volatile__ ("sfence.vma %0, %1" + : + : "r" (addr), "r" (asid) + : "memory"); +} + +static inline void local_flush_tlb_range(unsigned long start, + unsigned long size, unsigned long stride) +{ + if (size <= stride) + local_flush_tlb_page(start); + else + local_flush_tlb_all(); +} + +static inline void local_flush_tlb_range_asid(unsigned long start, + unsigned long size, unsigned long stride, unsigned long asid) +{ + if (size <= stride) + local_flush_tlb_page_asid(start, asid); + else + local_flush_tlb_all_asid(asid); +} + +static void __ipi_flush_tlb_all(void *info) +{ + local_flush_tlb_all(); +} + +void flush_tlb_all(void) +{ + if (riscv_use_ipi_for_rfence()) + on_each_cpu(__ipi_flush_tlb_all, NULL, 1); + else + sbi_remote_sfence_vma(NULL, 0, -1); +} + +struct flush_tlb_range_data { + unsigned long asid; + unsigned long start; + unsigned long size; + unsigned long stride; +}; + +static void __ipi_flush_tlb_range_asid(void *info) +{ + struct flush_tlb_range_data *d = info; + + local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid); +} + +static void __ipi_flush_tlb_range(void *info) +{ + struct flush_tlb_range_data *d = info; + + local_flush_tlb_range(d->start, d->size, d->stride); +} + +static void __flush_tlb_range(struct mm_struct *mm, unsigned long start, + unsigned long size, unsigned long stride) +{ + struct flush_tlb_range_data ftd; + struct cpumask *cmask = mm_cpumask(mm); + unsigned int cpuid; + bool broadcast; + + if (cpumask_empty(cmask)) + return; + + cpuid = get_cpu(); + /* check if the tlbflush needs to be sent to other CPUs */ + broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids; + if (static_branch_unlikely(&use_asid_allocator)) { + unsigned long asid = atomic_long_read(&mm->context.id) & asid_mask; + + if (broadcast) { + if (riscv_use_ipi_for_rfence()) { + ftd.asid = asid; + ftd.start = start; + ftd.size = size; + ftd.stride = stride; + on_each_cpu_mask(cmask, + __ipi_flush_tlb_range_asid, + &ftd, 1); + } else + sbi_remote_sfence_vma_asid(cmask, + start, size, asid); + } else { + local_flush_tlb_range_asid(start, size, stride, asid); + } + } else { + if (broadcast) { + if (riscv_use_ipi_for_rfence()) { + ftd.asid = 0; + ftd.start = start; + ftd.size = size; + ftd.stride = stride; + on_each_cpu_mask(cmask, + __ipi_flush_tlb_range, + &ftd, 1); + } else + sbi_remote_sfence_vma(cmask, start, size); + } else { + local_flush_tlb_range(start, size, stride); + } + } + + put_cpu(); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + __flush_tlb_range(mm, 0, -1, PAGE_SIZE); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) +{ + __flush_tlb_range(vma->vm_mm, addr, PAGE_SIZE, PAGE_SIZE); +} + +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + __flush_tlb_range(vma->vm_mm, start, end - start, PAGE_SIZE); +} +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + __flush_tlb_range(vma->vm_mm, start, end - start, PMD_SIZE); +} +#endif |